Date: (Wed) Oct 19, 2016
Data: Source: Training: http://catalog.data.gov/dataset/consumer-complaint-database
New: None
Time period:
Based on analysis utilizing <> techniques,
US Federal Government: Bureau of Consumer Financial Protection (CFPB): Consumer Complaint Database: Metadata Updated: Sep 26, 2015
The observations are complaints CFPB received about financial products and services from US residents. The objective is to create a classifier that predicts if a specific customer will dispute a complaint with a bank: Consumer Dispute? (Yes or No).
Display ggtile plot of Test data frame observations
Display ggtile plot of Validation data frame observations
Display AUC Curve
Display classifier threshold selection plot
[](
Summary of key steps & error improvement stats:
Product: Include all & create product groups to minimize model stacks
Ensemble models
rm(list = ls())
set.seed(12345)
options(stringsAsFactors = FALSE)
source("~/Dropbox/datascience/R/mycaret.R")
source("~/Dropbox/datascience/R/mypetrinet.R")
source("~/Dropbox/datascience/R/myplclust.R")
source("~/Dropbox/datascience/R/myplot.R")
source("~/Dropbox/datascience/R/myscript.R")
source("~/Dropbox/datascience/R/mytm.R")
if (is.null(knitr::opts_current$get(name = 'label'))) # Running in IDE
debugSource("~/Dropbox/datascience/R/mydsutils.R") else
source("~/Dropbox/datascience/R/mydsutils.R")
## Loading required package: proxy
##
## Attaching package: 'proxy'
## The following objects are masked from 'package:stats':
##
## as.dist, dist
## The following object is masked from 'package:base':
##
## as.matrix
## Loading required package: caret
## Loading required package: lattice
# Gather all package requirements here
suppressPackageStartupMessages(require(doMC))
glbCores <- 10 # of cores on machine - 2
registerDoMC(glbCores)
suppressPackageStartupMessages(require(caret))
require(plyr)
## Loading required package: plyr
require(dplyr)
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
require(knitr)
## Loading required package: knitr
require(stringr)
## Loading required package: stringr
#source("dbgcaret.R")
#packageVersion("snow")
#require(sos); findFn("complete", maxPages=2, sortby="MaxScore")
# Analysis control global variables
# Inputs
# url/name = "<PathPointer>"; if url specifies a zip file, name = "<filename>";
# or named collection of <PathPointer>s
# sep = choose from c(NULL, "\t")
#glbObsTrnFile <- list(url = "http://catalog.data.gov/dataset/consumer-complaint-database"
glbObsTrnFile <- list(name = "Consumer_Complaints.csv"
# or list(url = c(NULL, <.inp1> = "<path1>", <.inp2> = "<path2>"))
, splitSpecs = list(method = "condition" # default when glbObsNewFile is NULL
# select from c("copy", NULL ???, "condition", "sample", )
# ,nRatio = 0.3 # > 0 && < 1 if method == "sample"
# ,seed = 123 # any integer or glbObsTrnPartitionSeed if method == "sample"
,condition = 'Consumerdisputed == ""'# or 'is.na(<var>)'; '<var> <condition_operator> <value>'
)
)
glbObsNewFile <- NULL # default OR list(url = "None")
#sort(table(glbObsTrn$Product, useNA = "ifany"))
#sort(table(glbObsAll$Product, useNA = "ifany"))
glbObsDropCondition <- #NULL # : default
# enclose in single-quotes b/c condition might include double qoutes
# use | & ; NOT || &&
# '<condition>'
'(glbObsAll[, "Product"] %in% c("Mortgage", "Debt collection", "Credit reporting", "Credit card", "Bank account or service", NULL, "Payday loan", "Money transfers", "Prepaid card", "Other financial service", "Virtual currency")) '
# 'grepl("^First Draft Video:", glbObsAll$Headline)'
# 'is.na(glbObsAll[, glb_rsp_var_raw])'
# '(is.na(glbObsAll[, glb_rsp_var_raw]) & grepl("Train", glbObsAll[, glbFeatsId]))'
# 'is.na(strptime(glbObsAll[, "Date"], glbFeatsDateTime[["Date"]]["format"], tz = glbFeatsDateTime[["Date"]]["timezone"]))'
# '(is.na(glbObsAll[, "Q109244"]) | (glbObsAll[, "Q109244"] != "No"))'
#nrow(do.call("subset",list(glbObsAll, parse(text=paste0("!(", glbObsDropCondition, ")")))))
glb_obs_repartition_train_condition <- NULL # : default
# "<condition>"
glb_max_fitobs <- NULL # or any integer
glbObsTrnPartitionSeed <- 123 # or any integer
glb_is_regression <- FALSE; glb_is_classification <- !glb_is_regression;
glb_is_binomial <- TRUE # or TRUE or FALSE
glb_rsp_var_raw <- "Consumerdisputed"
# for classification, the response variable has to be a factor
glb_rsp_var <- "CDisputed.fctr"
# if the response factor is based on numbers/logicals e.g (0/1 OR TRUE/FALSE vs. "A"/"B"),
# or contains spaces (e.g. "Not in Labor Force")
# caret predict(..., type="prob") crashes
glb_map_rsp_raw_to_var <- #NULL
function(raw) {
# return(raw ^ 0.5)
# return(log(raw))
# return(log(1 + raw))
# return(log10(raw))
# return(exp(-raw / 2))
#
# chk ref value against frequencies vs. alpha sort order
ret_vals <- rep_len(NA, length(raw));
ret_vals[raw != ""] <- ifelse(raw[raw != ""] == "Yes", "Y", "N");
return(relevel(as.factor(ret_vals), ref="N"))
#
# as.factor(paste0("B", raw))
# as.factor(gsub(" ", "\\.", raw))
}
#if glb_rsp_var_raw is numeric:
#print(summary(glbObsAll[, glb_rsp_var_raw]))
#glb_map_rsp_raw_to_var(tst <- c(NA, as.numeric(summary(glbObsAll[, glb_rsp_var_raw]))))
#if glb_rsp_var_raw is character:
#print(table(glbObsAll[, glb_rsp_var_raw], useNA = "ifany"))
#print(table(glb_map_rsp_raw_to_var(tst <- glbObsAll[, glb_rsp_var_raw]), useNA = "ifany"))
glb_map_rsp_var_to_raw <- #NULL
function(var) {
# return(var ^ 2.0)
# return(exp(var))
# return(10 ^ var)
# return(-log(var) * 2)
# as.numeric(var)
levels(var)[as.numeric(var)]
# sapply(levels(var)[as.numeric(var)], function(elm)
# if (is.na(elm)) return(elm) else
# if (elm == 'R') return("Republican") else
# if (elm == 'D') return("Democrat") else
# stop("glb_map_rsp_var_to_raw: unexpected value: ", elm)
# )
# gsub("\\.", " ", levels(var)[as.numeric(var)])
# c("<=50K", " >50K")[as.numeric(var)]
# c(FALSE, TRUE)[as.numeric(var)]
}
#print(table(glb_map_rsp_var_to_raw(glb_map_rsp_raw_to_var(tst)), useNA = "ifany"))
if ((glb_rsp_var != glb_rsp_var_raw) && is.null(glb_map_rsp_raw_to_var))
stop("glb_map_rsp_raw_to_var function expected")
# List info gathered for various columns
# <col_name>: <description>; <notes>
# "Datereceived":
# "Product":
# "Subproduct":
# "Issue":
# "Subissue":
# "Consumercomplaintnarrative":
# "Companypublicresponse":
# "Company":
# "State":
# "ZIPcode":
# "Tags":
# "Consumerconsentprovided":
# "Submittedvia":
# "Datesenttocompany":
# "Companyresponsetoconsumer":
# "Timelyresponse":
# "Consumerdisputed": "", "No", "Yes"; glb_rsp_var_raw
# "ComplaintID": glbFeatsId
# currently does not handle more than 1 column; consider concatenating multiple columns
# If glbFeatsId == NULL, ".rownames <- as.numeric(row.names())" is the default
glbFeatsId <- "ComplaintID" # choose from c(NULL : default, "<id_feat>")
glbFeatsCategory <- NULL # choose from c(NULL : default, "<category_feat>")
# User-specified exclusions
glbFeatsExclude <- c(NULL
# Feats that shd be excluded due to known causation by prediction variable
# , "<feat1", "<feat2>"
# Feats that are factors with unique values (as % of nObs) > 49 (empirically derived)
# Feats that are linear combinations (alias in glm)
# Feature-engineering phase -> start by excluding all features except id & category &
# work each one in
, "Datereceived"
, "Product"
, "Subproduct"
, "Issue"
, "Subissue"
, "Consumercomplaintnarrative"
, "Companypublicresponse"
, "Company"
, "State"
, "ZIPcode"
, "Tags"
, "Consumerconsentprovided"
, "Submittedvia"
, "Datesenttocompany"
, "Companyresponsetoconsumer"
#, "Timelyresponse"
)
if (glb_rsp_var_raw != glb_rsp_var)
glbFeatsExclude <- union(glbFeatsExclude, glb_rsp_var_raw)
glbFeatsInteractionOnly <- list()
#glbFeatsInteractionOnly[["<child_feat>"]] <- "<parent_feat>"
glbFeatsDrop <- c(NULL
# , "<feat1>", "<feat2>"
)
glb_map_vars <- NULL # or c("<var1>", "<var2>")
glb_map_urls <- list();
# glb_map_urls[["<var1>"]] <- "<var1.url>"
# Derived features; Use this mechanism to cleanse data ??? Cons: Data duplication ???
glbFeatsDerive <- list();
# glbFeatsDerive[["<feat.my.sfx>"]] <- list(
# mapfn = function(<arg1>, <arg2>) { return(function(<arg1>, <arg2>)) }
# , args = c("<arg1>", "<arg2>"))
#myprint_df(data.frame(ImageId = mapfn(glbObsAll$.src, glbObsAll$.pos)))
#data.frame(ImageId = mapfn(glbObsAll$.src, glbObsAll$.pos))[7045:7055, ]
# character
glbFeatsDerive[["Sent.Recd.Dys"]] <- list(
mapfn = function(raw1, raw2) { return(as.numeric(difftime(strptime(raw1, format = "%m/%d/%Y"), strptime(raw2, format = "%m/%d/%Y"), units = "days"))) }
, args = c("Datesenttocompany", "Datereceived"))
#smp <- mapfn(head(glbObsAll$Datesenttocompany), head(glbObsAll$Datereceived))
glbFeatsDerive[["spcPrd"]] <- list(
mapfn = function(raw1, raw2) { return(gsub(" ", "", paste(raw1, raw2, sep = "#"))) }
, args = c("Product", "Subproduct"))
#smp <- mapfn(glbObsAll$Product, glbObsAll$Subproduct); print(table(smp, useNA = "ifany"))
glbFeatsDerive[["Rgn.Dvn"]] <- list(
mapfn = function(raw1) { raw <- gsub(" ", "XX", raw1, fixed = TRUE);
raw <- gsub("(CT|ME|MA|NH|RI|VT)", "NE#NewEngland", raw);
raw <- gsub("(NJ|NY|PA)", "NE#MidAtlantic", raw);
raw <- gsub("(IL|IN|MI|OH|WI)", "MW#EastNorthCentral", raw);
raw <- gsub("(IA|KS|MN|MO|NE|ND|SD)", "MW#WestNorthCentral", raw);
raw <- gsub("(DE|FL|GA|MD|NC|SC|VA|DC|WV)", "SH#SouthAtlantic", raw);
raw <- gsub("(AL|KY|MS|TN)", "SH#EastSouthCentral", raw);
raw <- gsub("(AR|LA|OK|TX)", "SH#WestSouthCentral", raw);
raw <- gsub("(AZ|CO|ID|MT|NV|NM|UT|WY)", "WT#Mountain", raw);
raw <- gsub("(AK|CA|HI|OR|WA)", "WT#Pacific", raw);
raw[!grepl("#", raw)] <- "OT#Other";
return(raw) }
, args = c("State"))
#smp <- mapfn(glbObsAll$State); print(sort(table(smp, useNA = "ifany")))
glbFeatsDerive[["gTags"]] <- list(
mapfn = function(raw1) { raw <- gsub("( |,)", "", raw1);
raw[raw == ""] <- "Other";
return(raw) }
, args = c("Tags"))
#smp <- mapfn(glbObsAll$Tags); print(sort(table(smp, useNA = "ifany")))
glbFeatsDerive[["gCConsent"]] <- list(
mapfn = function(raw1) { raw <- gsub("( |/)", "", raw1);
raw[raw == ""] <- "NA";
return(raw) }
, args = c("Consumerconsentprovided"))
#smp <- mapfn(glbObsAll$Consumerconsentprovided); print(sort(table(smp, useNA = "ifany")))
glbFeatsDerive[["Channel"]] <- list(
mapfn = function(raw1) { raw <- gsub(" ", "", raw1, fixed = TRUE);
return(raw) }
, args = c("Submittedvia"))
#smp <- mapfn(glbObsAll$Submittedvia); print(sort(table(smp, useNA = "ifany")))
glbFeatsDerive[["Response"]] <- list(
mapfn = function(raw1) { raw <- gsub(" ", "", raw1, fixed = TRUE);
return(raw) }
, args = c("Companyresponsetoconsumer"))
#smp <- mapfn(glbObsAll$Companyresponsetoconsumer); print(sort(table(smp, useNA = "ifany")))
# mapfn = function(Education) { raw <- Education; raw[is.na(raw)] <- "NA.my"; return(as.factor(raw)) }
# mapfn = function(Week) { return(substr(Week, 1, 10)) }
# mapfn = function(Name) { return(sapply(Name, function(thsName)
# str_sub(unlist(str_split(thsName, ","))[1], 1, 1))) }
# mapfn = function(descriptor) { return(plyr::revalue(descriptor, c(
# "ABANDONED BUILDING" = "OTHER",
# "**" = "**"
# ))) }
# mapfn = function(description) { mod_raw <- description;
# This is here because it does not work if it's in txt_map_filename
# mod_raw <- gsub(paste0(c("\n", "\211", "\235", "\317", "\333"), collapse = "|"), " ", mod_raw)
# Don't parse for "." because of ".com"; use customized gsub for that text
# mod_raw <- gsub("(\\w)(!|\\*|,|-|/)(\\w)", "\\1\\2 \\3", mod_raw);
# Some state acrnoyms need context for separation e.g.
# LA/L.A. could either be "Louisiana" or "LosAngeles"
# modRaw <- gsub("\\bL\\.A\\.( |,|')", "LosAngeles\\1", modRaw);
# OK/O.K. could either be "Oklahoma" or "Okay"
# modRaw <- gsub("\\bACA OK\\b", "ACA OKay", modRaw);
# modRaw <- gsub("\\bNow O\\.K\\.\\b", "Now OKay", modRaw);
# PR/P.R. could either be "PuertoRico" or "Public Relations"
# modRaw <- gsub("\\bP\\.R\\. Campaign", "PublicRelations Campaign", modRaw);
# VA/V.A. could either be "Virginia" or "VeteransAdministration"
# modRaw <- gsub("\\bthe V\\.A\\.\\:", "the VeteranAffairs:", modRaw);
#
# Custom mods
# return(mod_raw) }
# numeric
# Create feature based on record position/id in data
glbFeatsDerive[[".pos"]] <- list(
mapfn = function(raw1) { return(1:length(raw1)) }
, args = c(".rnorm"))
# glbFeatsDerive[[".pos.y"]] <- list(
# mapfn = function(raw1) { return(1:length(raw1)) }
# , args = c(".rnorm"))
# Add logs of numerics that are not distributed normally
# Derive & keep multiple transformations of the same feature, if normality is hard to achieve with just one transformation
# Right skew: logp1; sqrt; ^ 1/3; logp1(logp1); log10; exp(-<feat>/constant)
# glbFeatsDerive[["WordCount.log1p"]] <- list(
# mapfn = function(WordCount) { return(log1p(WordCount)) }
# , args = c("WordCount"))
# glbFeatsDerive[["WordCount.root2"]] <- list(
# mapfn = function(Sent.Recd.Dys) { return(WordCount ^ (1/2)) }
# , args = c("WordCount"))
# glbFeatsDerive[["WordCount.nexp"]] <- list(
# mapfn = function(WordCount) { return(exp(-WordCount)) }
# , args = c("WordCount"))
#print(summary(glbObsAll$WordCount))
#print(summary(mapfn(glbObsAll$WordCount)))
glbFeatsDerive[["Sent.Recd.Dys.log1p"]] <- list(
mapfn = function(raw1) { return(log1p(raw1 + 1)) } # Account for -1
, args = c("Sent.Recd.Dys"))
glbFeatsDerive[["Sent.Recd.Dys.root2"]] <- list(
mapfn = function(raw1) { return((raw1 + 1) ^ (1/2)) }
, args = c("Sent.Recd.Dys"))
glbFeatsDerive[["Sent.Recd.Dys.nexp"]] <- list(
mapfn = function(raw1) { return(exp(-raw1)) }
, args = c("Sent.Recd.Dys"))
#print(summary(glbObsAll$Sent.Recd.Dys)); print(summary(mapfn(glbObsAll$Sent.Recd.Dys)))
# If imputation shd be skipped for this feature
# glbFeatsDerive[["District.fctr"]] <- list(
# mapfn = function(District) {
# raw <- District;
# ret_vals <- rep_len("NA", length(raw));
# ret_vals[!is.na(raw)] <- sapply(raw[!is.na(raw)], function(elm)
# ifelse(elm < 10, "1-9",
# ifelse(elm < 20, "10-19", "20+")));
# return(relevel(as.factor(ret_vals), ref = "NA"))
# }
# mapfn = function(raw1) { raw <- 2016 - raw1;
# # raw[!is.na(raw) & raw >= 2010] <- NA
# raw[!is.na(raw) & (raw <= 15)] <- NA
# raw[!is.na(raw) & (raw >= 90)] <- NA
# retVal <- rep_len("NA", length(raw))
# # breaks = c(1879, seq(1949, 1989, 10), 2049)
# # cutVal <- cut(raw[!is.na(raw)], breaks = breaks,
# # labels = as.character(breaks + 1)[1:(length(breaks) - 1)])
# cutVal <- cut(raw[!is.na(raw)], breaks = c(15, 20, 25, 30, 35, 40, 50, 65, 90))
# retVal[!is.na(raw)] <- levels(cutVal)[cutVal]
# return(factor(retVal, levels = c("NA"
# ,"(15,20]","(20,25]","(25,30]","(30,35]","(35,40]","(40,50]","(50,65]","(65,90]"),
# ordered = TRUE))
# }
# , args = c("District"))
# If imputation of missing data is not working ...
# glbFeatsDerive[["FertilityRate.nonNA"]] <- list(
# mapfn = function(FertilityRate, Region) {
# RegionMdn <- tapply(FertilityRate, Region, FUN = median, na.rm = TRUE)
#
# retVal <- FertilityRate
# retVal[is.na(FertilityRate)] <- RegionMdn[Region[is.na(FertilityRate)]]
# return(retVal)
# }
# , args = c("FertilityRate", "Region"))
# for (qsn in grep("Q12", glbFeatsExclude, fixed = TRUE, value = TRUE))
# glbFeatsDerive[[paste0(qsn, ".fctr")]] <- list(
# mapfn = function(raw1) {
# raw1[raw1 %in% ""] <- "NA"
# rawVal <- unique(raw1)
#
# if (length(setdiff(rawVal, (expVal <- c("NA", "No", "Ys")))) == 0) {
# raw1 <- gsub("Yes", "Ys", raw1, fixed = TRUE)
# if (length(setdiff(rawVal, expVal)) > 0)
# stop(qsn, " vals: ", paste0(rawVal, collapse = "|"),
# " does not match expectation: ", paste0(expVal, collapse = "|"))
# } else
# if (length(setdiff(rawVal, (expVal <- c("NA", "Private", "Public")))) == 0) {
# raw1 <- gsub("Private", "Pt", raw1, fixed = TRUE)
# raw1 <- gsub("Public" , "Pc", raw1, fixed = TRUE)
# if (length(setdiff(rawVal, expVal)) > 0)
# stop(qsn, " vals: ", paste0(rawVal, collapse = "|"),
# " does not match expectation: ", paste0(expVal, collapse = "|"))
# }
#
# return(relevel(as.factor(raw1), ref = "NA"))
# }
# , args = c(qsn))
# mapfn = function(HOSPI.COST) { return(cut(HOSPI.COST, 5, breaks = c(0, 100000, 200000, 300000, 900000), labels = NULL)) }
# mapfn = function(Rasmussen) { return(ifelse(sign(Rasmussen) >= 0, 1, 0)) }
# mapfn = function(startprice) { return(startprice ^ (1/2)) }
# mapfn = function(startprice) { return(log(startprice)) }
# mapfn = function(startprice) { return(exp(-startprice / 20)) }
# mapfn = function(startprice) { return(scale(log(startprice))) }
# mapfn = function(startprice) { return(sign(sprice.predict.diff) * (abs(sprice.predict.diff) ^ (1/10))) }
# factor
# mapfn = function(PropR) { return(as.factor(ifelse(PropR >= 0.5, "Y", "N"))) }
# mapfn = function(productline, description) { as.factor(gsub(" ", "", productline)) }
# mapfn = function(purpose) { return(relevel(as.factor(purpose), ref="all_other")) }
# mapfn = function(raw) { tfr_raw <- as.character(cut(raw, 5));
# tfr_raw[is.na(tfr_raw)] <- "NA.my";
# return(as.factor(tfr_raw)) }
# mapfn = function(startprice.log10) { return(cut(startprice.log10, 3)) }
# mapfn = function(startprice.log10) { return(cut(sprice.predict.diff, c(-1000, -100, -10, -1, 0, 1, 10, 100, 1000))) }
# , args = c("<arg1>"))
# multiple args
# mapfn = function(id, date) { return(paste(as.character(id), as.character(date), sep = "#")) }
# mapfn = function(PTS, oppPTS) { return(PTS - oppPTS) }
# mapfn = function(startprice.log10.predict, startprice) {
# return(spdiff <- (10 ^ startprice.log10.predict) - startprice) }
# mapfn = function(productline, description) { as.factor(
# paste(gsub(" ", "", productline), as.numeric(nchar(description) > 0), sep = "*")) }
# mapfn = function(.src, .pos) {
# return(paste(.src, sprintf("%04d",
# ifelse(.src == "Train", .pos, .pos - 7049)
# ), sep = "#")) }
# # If glbObsAll is not sorted in the desired manner
# mapfn=function(Week) { return(coredata(lag(zoo(orderBy(~Week, glbObsAll)$ILI), -2, na.pad=TRUE))) }
# mapfn=function(ILI) { return(coredata(lag(zoo(ILI), -2, na.pad=TRUE))) }
# mapfn=function(ILI.2.lag) { return(log(ILI.2.lag)) }
# glbFeatsDerive[["<var1>"]] <- glbFeatsDerive[["<var2>"]]
# tst <- "descr.my"; args_lst <- NULL; for (arg in glbFeatsDerive[[tst]]$args) args_lst[[arg]] <- glbObsAll[, arg]; print(head(args_lst[[arg]])); print(head(drv_vals <- do.call(glbFeatsDerive[[tst]]$mapfn, args_lst)));
# print(which_ix <- which(args_lst[[arg]] == 0.75)); print(drv_vals[which_ix]);
glbFeatsDateTime <- list()
# Use OlsonNames() to enumerate supported time zones
# glbFeatsDateTime[["<DateTimeFeat>"]] <-
# c(format = "%Y-%m-%d %H:%M:%S" or "%m/%e/%y", timezone = "US/Eastern", impute.na = TRUE,
# last.ctg = FALSE, poly.ctg = FALSE)
glbFeatsDateTime[["Datereceived"]] <-
c(format = "%m/%d/%Y", impute.na = TRUE, last.ctg = FALSE, poly.ctg = FALSE)
glbFeatsDateTime[["Datesenttocompany"]] <-
c(format = "%m/%d/%Y", impute.na = TRUE, last.ctg = FALSE, poly.ctg = FALSE)
glbFeatsPrice <- NULL # or c("<price_var>")
glbFeatsImage <- list() #list(<imageFeat> = list(patchSize = 10)) # if patchSize not specified, no patch computation
glbFeatsText <- list()
Sys.setlocale("LC_ALL", "C") # For english
## [1] "C/C/C/C/C/en_US.UTF-8"
#glbFeatsText[["<TextFeature>"]] <- list(NULL,
# ,names = myreplacePunctuation(str_to_lower(gsub(" ", "", c(NULL,
# <comma-separated-screened-names>
# ))))
# ,rareWords = myreplacePunctuation(str_to_lower(gsub(" ", "", c(NULL,
# <comma-separated-nonSCOWL-words>
# ))))
#)
# Text Processing Step: custom modifications not present in txt_munge -> use glbFeatsDerive
# Text Processing Step: universal modifications
glb_txt_munge_filenames_pfx <- "<projectId>_mytxt_"
# Text Processing Step: tolower
# Text Processing Step: myreplacePunctuation
# Text Processing Step: removeWords
glb_txt_stop_words <- list()
# Remember to use unstemmed words
if (length(glbFeatsText) > 0) {
require(tm)
require(stringr)
glb_txt_stop_words[["<txt_var>"]] <-
sort(myreplacePunctuation(str_to_lower(gsub(" ", "", c(NULL
# Remove any words from stopwords
# , setdiff(myreplacePunctuation(stopwords("english")), c("<keep_wrd1>", <keep_wrd2>"))
# Remove salutations
,"mr","mrs","dr","Rev"
# Remove misc
#,"th" # Happy [[:digit::]]+th birthday
# Remove terms present in Trn only or New only; search for "Partition post-stem"
# ,<comma-separated-terms>
# cor.y.train == NA
# ,unlist(strsplit(paste(c(NULL
# ,"<comma-separated-terms>"
# ), collapse=",")
# freq == 1; keep c("<comma-separated-terms-to-keep>")
# ,<comma-separated-terms>
# chisq.pval high (e.g. == 1); keep c("<comma-separated-terms-to-keep>")
# ,<comma-separated-terms>
# nzv.freqRatio high (e.g. >= glbFeatsNzvFreqMax); keep c("<comma-separated-terms-to-keep>")
# ,<comma-separated-terms>
)))))
}
#orderBy(~term, glb_post_stem_words_terms_df_lst[[txtFeat]][grep("^man", glb_post_stem_words_terms_df_lst[[txtFeat]]$term), ])
#glbObsAll[glb_post_stem_words_terms_mtrx_lst[[txtFeat]][, 4866] > 0, c(glb_rsp_var, txtFeat)]
# To identify terms with a specific freq
#paste0(sort(subset(glb_post_stop_words_terms_df_lst[[txtFeat]], freq == 1)$term), collapse = ",")
#paste0(sort(subset(glb_post_stem_words_terms_df_lst[[txtFeat]], freq <= 2)$term), collapse = ",")
#subset(glb_post_stem_words_terms_df_lst[[txtFeat]], term %in% c("zinger"))
# To identify terms with a specific freq &
# are not stemmed together later OR is value of color.fctr (e.g. gold)
#paste0(sort(subset(glb_post_stop_words_terms_df_lst[[txtFeat]], (freq == 1) & !(term %in% c("blacked","blemish","blocked","blocks","buying","cables","careful","carefully","changed","changing","chargers","cleanly","cleared","connect","connects","connected","contains","cosmetics","default","defaulting","defective","definitely","describe","described","devices","displays","drop","drops","engravement","excellant","excellently","feels","fix","flawlessly","frame","framing","gentle","gold","guarantee","guarantees","handled","handling","having","install","iphone","iphones","keeped","keeps","known","lights","line","lining","liquid","liquidation","looking","lots","manuals","manufacture","minis","most","mostly","network","networks","noted","opening","operated","performance","performs","person","personalized","photograph","physically","placed","places","powering","pre","previously","products","protection","purchasing","returned","rotate","rotation","running","sales","second","seconds","shipped","shuts","sides","skin","skinned","sticker","storing","thats","theres","touching","unusable","update","updates","upgrade","weeks","wrapped","verified","verify") ))$term), collapse = ",")
#print(subset(glb_post_stem_words_terms_df_lst[[txtFeat]], (freq <= 2)))
#glbObsAll[which(terms_mtrx[, 229] > 0), glbFeatsText]
# To identify terms with cor.y == NA
#orderBy(~-freq+term, subset(glb_post_stop_words_terms_df_lst[[txtFeat]], is.na(cor.y)))
#paste(sort(subset(glb_post_stop_words_terms_df_lst[[txtFeat]], is.na(cor.y))[, "term"]), collapse=",")
#orderBy(~-freq+term, subset(glb_post_stem_words_terms_df_lst[[txtFeat]], is.na(cor.y)))
# To identify terms with low cor.y.abs
#head(orderBy(~cor.y.abs+freq+term, subset(glb_post_stem_words_terms_df_lst[[txtFeat]], !is.na(cor.y))), 5)
# To identify terms with high chisq.pval
#subset(glb_post_stem_words_terms_df_lst[[txtFeat]], chisq.pval > 0.99)
#paste0(sort(subset(glb_post_stem_words_terms_df_lst[[txtFeat]], (chisq.pval > 0.99) & (freq <= 10))$term), collapse=",")
#paste0(sort(subset(glb_post_stem_words_terms_df_lst[[txtFeat]], (chisq.pval > 0.9))$term), collapse=",")
#head(orderBy(~-chisq.pval+freq+term, glb_post_stem_words_terms_df_lst[[txtFeat]]), 5)
#glbObsAll[glb_post_stem_words_terms_mtrx_lst[[txtFeat]][, 68] > 0, glbFeatsText]
#orderBy(~term, glb_post_stem_words_terms_df_lst[[txtFeat]][grep("^m", glb_post_stem_words_terms_df_lst[[txtFeat]]$term), ])
# To identify terms with high nzv.freqRatio
#summary(glb_post_stem_words_terms_df_lst[[txtFeat]]$nzv.freqRatio)
#paste0(sort(setdiff(subset(glb_post_stem_words_terms_df_lst[[txtFeat]], (nzv.freqRatio >= glbFeatsNzvFreqMax) & (freq < 10) & (chisq.pval >= 0.05))$term, c( "128gb","3g","4g","gold","ipad1","ipad3","ipad4","ipadair2","ipadmini2","manufactur","spacegray","sprint","tmobil","verizon","wifion"))), collapse=",")
# To identify obs with a txt term
#tail(orderBy(~-freq+term, glb_post_stop_words_terms_df_lst[[txtFeat]]), 20)
#mydspObs(list(descr.my.contains="non"), cols=c("color", "carrier", "cellular", "storage"))
#grep("ever", dimnames(terms_stop_mtrx)$Terms)
#which(terms_stop_mtrx[, grep("ipad", dimnames(terms_stop_mtrx)$Terms)] > 0)
#glbObsAll[which(terms_stop_mtrx[, grep("16", dimnames(terms_stop_mtrx)$Terms)[1]] > 0), c(glbFeatsCategory, "storage", txtFeat)]
# Text Processing Step: screen for names # Move to glbFeatsText specs section in order of text processing steps
# glbFeatsText[["<txtFeat>"]]$names <- myreplacePunctuation(str_to_lower(gsub(" ", "", c(NULL
# # Person names for names screening
# ,<comma-separated-list>
#
# # Company names
# ,<comma-separated-list>
#
# # Product names
# ,<comma-separated-list>
# ))))
# glbFeatsText[["<txtFeat>"]]$rareWords <- myreplacePunctuation(str_to_lower(gsub(" ", "", c(NULL
# # Words not in SCOWL db
# ,<comma-separated-list>
# ))))
# To identify char vectors post glbFeatsTextMap
#grep("six(.*)hour", glb_txt_chr_lst[[txtFeat]], ignore.case = TRUE, value = TRUE)
#grep("[S|s]ix(.*)[H|h]our", glb_txt_chr_lst[[txtFeat]], value = TRUE)
# To identify whether terms shd be synonyms
#orderBy(~term, glb_post_stop_words_terms_df_lst[[txtFeat]][grep("^moder", glb_post_stop_words_terms_df_lst[[txtFeat]]$term), ])
# term_row_df <- glb_post_stop_words_terms_df_lst[[txtFeat]][grep("^came$", glb_post_stop_words_terms_df_lst[[txtFeat]]$term), ]
#
# cor(glb_post_stop_words_terms_mtrx_lst[[txtFeat]][glbObsAll$.lcn == "Fit", term_row_df$pos], glbObsTrn[, glb_rsp_var], use="pairwise.complete.obs")
# To identify which stopped words are "close" to a txt term
#sort(glbFeatsCluster)
# Text Processing Step: stemDocument
# To identify stemmed txt terms
#glb_post_stop_words_terms_df_lst[[txtFeat]][grep("^la$", glb_post_stop_words_terms_df_lst[[txtFeat]]$term), ]
#orderBy(~term, glb_post_stem_words_terms_df_lst[[txtFeat]][grep("^con", glb_post_stem_words_terms_df_lst[[txtFeat]]$term), ])
#glbObsAll[which(terms_stem_mtrx[, grep("use", dimnames(terms_stem_mtrx)$Terms)[[1]]] > 0), c(glbFeatsId, "productline", txtFeat)]
#glbObsAll[which(TfIdf_stem_mtrx[, 191] > 0), c(glbFeatsId, glbFeatsCategory, txtFeat)]
#glbObsAll[which(glb_post_stop_words_terms_mtrx_lst[[txtFeat]][, 6165] > 0), c(glbFeatsId, glbFeatsCategory, txtFeat)]
#which(glbObsAll$UniqueID %in% c(11915, 11926, 12198))
# Text Processing Step: mycombineSynonyms
# To identify which terms are associated with not -> combine "could not" & "couldn't"
#findAssocs(glb_full_DTM_lst[[txtFeat]], "not", 0.05)
# To identify which synonyms should be combined
#orderBy(~term, glb_post_stem_words_terms_df_lst[[txtFeat]][grep("^c", glb_post_stem_words_terms_df_lst[[txtFeat]]$term), ])
chk_comb_cor <- function(syn_lst) {
# cor(terms_stem_mtrx[glbObsAll$.src == "Train", grep("^(damag|dent|ding)$", dimnames(terms_stem_mtrx)[[2]])], glbObsTrn[, glb_rsp_var], use="pairwise.complete.obs")
print(subset(glb_post_stem_words_terms_df_lst[[txtFeat]], term %in% syn_lst$syns))
print(subset(get_corpus_terms(tm_map(glbFeatsTextCorpus[[txtFeat]], mycombineSynonyms, list(syn_lst), lazy=FALSE)), term == syn_lst$word))
# cor(terms_stop_mtrx[glbObsAll$.src == "Train", grep("^(damage|dent|ding)$", dimnames(terms_stop_mtrx)[[2]])], glbObsTrn[, glb_rsp_var], use="pairwise.complete.obs")
# cor(rowSums(terms_stop_mtrx[glbObsAll$.src == "Train", grep("^(damage|dent|ding)$", dimnames(terms_stop_mtrx)[[2]])]), glbObsTrn[, glb_rsp_var], use="pairwise.complete.obs")
}
#chk_comb_cor(syn_lst=list(word="cabl", syns=c("cabl", "cord")))
#chk_comb_cor(syn_lst=list(word="damag", syns=c("damag", "dent", "ding")))
#chk_comb_cor(syn_lst=list(word="dent", syns=c("dent", "ding")))
#chk_comb_cor(syn_lst=list(word="use", syns=c("use", "usag")))
glbFeatsTextSynonyms <- list()
# list parsed to collect glbFeatsText[[<txtFeat>]]$vldTerms
# glbFeatsTextSynonyms[["Hdln.my"]] <- list(NULL
# # people in places
# , list(word = "australia", syns = c("australia", "australian"))
# , list(word = "italy", syns = c("italy", "Italian"))
# , list(word = "newyork", syns = c("newyork", "newyorker"))
# , list(word = "Pakistan", syns = c("Pakistan", "Pakistani"))
# , list(word = "peru", syns = c("peru", "peruvian"))
# , list(word = "qatar", syns = c("qatar", "qatari"))
# , list(word = "scotland", syns = c("scotland", "scotish"))
# , list(word = "Shanghai", syns = c("Shanghai", "Shanzhai"))
# , list(word = "venezuela", syns = c("venezuela", "venezuelan"))
#
# # companies - needs to be data dependent
# # - e.g. ensure BNP in this experiment/feat always refers to BNPParibas
#
# # general synonyms
# , list(word = "Create", syns = c("Create","Creator"))
# , list(word = "cute", syns = c("cute","cutest"))
# , list(word = "Disappear", syns = c("Disappear","Fadeout"))
# , list(word = "teach", syns = c("teach", "taught"))
# , list(word = "theater", syns = c("theater", "theatre", "theatres"))
# , list(word = "understand", syns = c("understand", "understood"))
# , list(word = "weak", syns = c("weak", "weaken", "weaker", "weakest"))
# , list(word = "wealth", syns = c("wealth", "wealthi"))
#
# # custom synonyms (phrases)
#
# # custom synonyms (names)
# )
#glbFeatsTextSynonyms[["<txtFeat>"]] <- list(NULL
# , list(word="<stem1>", syns=c("<stem1>", "<stem1_2>"))
# )
for (txtFeat in names(glbFeatsTextSynonyms))
for (entryIx in 1:length(glbFeatsTextSynonyms[[txtFeat]])) {
glbFeatsTextSynonyms[[txtFeat]][[entryIx]]$word <-
str_to_lower(glbFeatsTextSynonyms[[txtFeat]][[entryIx]]$word)
glbFeatsTextSynonyms[[txtFeat]][[entryIx]]$syns <-
str_to_lower(glbFeatsTextSynonyms[[txtFeat]][[entryIx]]$syns)
}
glbFeatsTextSeed <- 181
# tm options include: check tm::weightSMART
glb_txt_terms_control <- list( # Gather model performance & run-time stats
# weighting = function(x) weightSMART(x, spec = "nnn")
# weighting = function(x) weightSMART(x, spec = "lnn")
# weighting = function(x) weightSMART(x, spec = "ann")
# weighting = function(x) weightSMART(x, spec = "bnn")
# weighting = function(x) weightSMART(x, spec = "Lnn")
#
weighting = function(x) weightSMART(x, spec = "ltn") # default
# weighting = function(x) weightSMART(x, spec = "lpn")
#
# weighting = function(x) weightSMART(x, spec = "ltc")
#
# weighting = weightBin
# weighting = weightTf
# weighting = weightTfIdf # : default
# termFreq selection criteria across obs: tm default: list(global=c(1, Inf))
, bounds = list(global = c(1, Inf))
# wordLengths selection criteria: tm default: c(3, Inf)
, wordLengths = c(1, Inf)
)
glb_txt_cor_var <- glb_rsp_var # : default # or c(<feat>)
# select one from c("union.top.val.cor", "top.cor", "top.val", default: "top.chisq", "sparse")
glbFeatsTextFilter <- "top.chisq"
glbFeatsTextTermsMax <- rep(10, length(glbFeatsText)) # :default
names(glbFeatsTextTermsMax) <- names(glbFeatsText)
# Text Processing Step: extractAssoc
glbFeatsTextAssocCor <- rep(1, length(glbFeatsText)) # :default
names(glbFeatsTextAssocCor) <- names(glbFeatsText)
# Remember to use stemmed terms
glb_important_terms <- list()
# Text Processing Step: extractPatterns (ngrams)
glbFeatsTextPatterns <- list()
#glbFeatsTextPatterns[[<txtFeat>>]] <- list()
#glbFeatsTextPatterns[[<txtFeat>>]] <- c(metropolitan.diary.colon = "Metropolitan Diary:")
# Have to set it even if it is not used
# Properties:
# numrows(glb_feats_df) << numrows(glbObsFit
# Select terms that appear in at least 0.2 * O(FP/FN(glbObsOOB)) ???
# numrows(glbObsOOB) = 1.1 * numrows(glbObsNew) ???
glb_sprs_thresholds <- NULL # or c(<txtFeat1> = 0.988, <txtFeat2> = 0.970, <txtFeat3> = 0.970)
glbFctrMaxUniqVals <- 20 # default: 20
glb_impute_na_data <- FALSE # or TRUE
glb_mice_complete.seed <- 144 # or any integer
glbFeatsCluster <- NULL # NULL: default or c("<feat1>", "<feat2>")
glb_cluster.seed <- 189 # or any integer
glbClusterEntropyVar <- NULL # c(glb_rsp_var, as.factor(cut(glb_rsp_var, 3)), default: NULL)
glbFeatsClusterVarsExclude <- FALSE # default FALSE
glb_interaction_only_feats <- NULL # : default or c(<parent_feat> = "<child_feat>")
glbFeatsNzvFreqMax <- 19 # 19 : caret default
glbFeatsNzvUniqMin <- 10 # 10 : caret default
glbRFESizes <- list()
#glbRFESizes[["mdlFamily"]] <- c(4, 8, 16, 32, 64, 67, 68, 69) # Accuracy@69/70 = 0.8258
glbRFEResults <- NULL
glbObsFitOutliers <- list()
# If outliers.n >= 10; consider concatenation of interaction vars
# glbObsFitOutliers[["<mdlFamily>"]] <- c(NULL
# is.na(.rstudent)
# max(.rstudent)
# is.na(.dffits)
# .hatvalues >= 0.99
# -38,167,642 < minmax(.rstudent) < 49,649,823
# , <comma-separated-<glbFeatsId>>
# )
glbObsTrnOutliers <- list()
glbObsTrnOutliers[["Final"]] <- union(glbObsFitOutliers[["All.X"]],
c(NULL
))
# Modify mdlId to (build & extract) "<FamilyId>#<Fit|Trn>#<caretMethod>#<preProc1.preProc2>#<samplingMethod>"
glb_models_lst <- list(); glb_models_df <- data.frame()
# Add xgboost algorithm
# Regression
if (glb_is_regression) {
glbMdlMethods <- c(NULL
# deterministic
#, "lm", # same as glm
, "glm", "bayesglm", "glmnet"
, "rpart"
# non-deterministic
, "gbm", "rf"
# Unknown
, "nnet" , "avNNet" # runs 25 models per cv sample for tunelength=5
, "svmLinear", "svmLinear2"
, "svmPoly" # runs 75 models per cv sample for tunelength=5
, "svmRadial"
, "earth"
, "bagEarth" # Takes a long time
,"xgbLinear","xgbTree"
)
} else
# Classification - Add ada (auto feature selection)
if (glb_is_binomial)
glbMdlMethods <- c(NULL
# deterministic
, "bagEarth" # Takes a long time
, "glm", "bayesglm", "glmnet"
, "nnet"
, "rpart"
# non-deterministic
, "gbm"
, "avNNet" # runs 25 models per cv sample for tunelength=5
, "rf"
# Unknown
, "lda", "lda2"
# svm models crash when predict is called -> internal to kernlab it should call predict without .outcome
, "svmLinear", "svmLinear2"
, "svmPoly" # runs 75 models per cv sample for tunelength=5
, "svmRadial"
, "earth"
,"xgbLinear","xgbTree"
) else
glbMdlMethods <- c(NULL
# deterministic
,"glmnet"
# non-deterministic
,"rf"
# Unknown
,"gbm","rpart","xgbLinear","xgbTree"
)
glbMdlFamilies <- list(); glb_mdl_feats_lst <- list()
# family: Choose from c("RFE.X", "Csm.X", "All.X", "Best.Interact") %*% c(NUll, ".NOr", ".Inc")
# RFE = "Recursive Feature Elimination"
# Csm = CuStoM
# NOr = No OutlieRs
# Inc = INteraCt
# methods: Choose from c(NULL, <method>, glbMdlMethods)
#glbMdlFamilies[["RFE.X"]] <- c("glmnet", "glm") # non-NULL vector is mandatory
if (glb_is_classification && !glb_is_binomial) {
# glm does not work for multinomial
glbMdlFamilies[["All.X"]] <- c("glmnet")
} else {
glbMdlFamilies[["All.X"]] <- c("glmnet", "glm")
}
#glbMdlFamilies[["All.X.Inc"]] <- glbMdlFamilies[["All.X"]] # value not used
# Check if interaction features make RFE better
# glbMdlFamilies[["CSM.X"]] <- setdiff(glbMdlMethods, c("lda", "lda2")) # crashing due to category:.clusterid ??? #c("glmnet", "glm") # non-NULL list is mandatory
# glb_mdl_feats_lst[["CSM.X"]] <- c(NULL
# , <comma-separated-features-vector>
# )
# dAFeats.CSM.X %<d-% c(NULL
# # Interaction feats up to varImp(RFE.X.glmnet) >= 50
# , <comma-separated-features-vector>
# , setdiff(myextract_actual_feats(predictors(glbRFEResults)), c(NULL
# , <comma-separated-features-vector>
# ))
# )
# glb_mdl_feats_lst[["CSM.X"]] <- "%<d-% dAFeats.CSM.X"
glbMdlFamilies[["Final"]] <- c(NULL) # NULL vector acceptable # c("glmnet", "glm")
glbMdlSequential <- c(NULL
#, "All.X#zv.pca#rcv#glmnet"
)
# Check if tuning parameters make fit better; make it mdlFamily customizable ?
glbMdlTuneParams <- data.frame()
# When glmnet crashes at model$grid with error: ???
# AllX__rcv_glmnetTuneParams <- rbind(data.frame()
# ,data.frame(parameter = "alpha", vals = "0.100 0.325 0.550 0.775 1.000")
# ,data.frame(parameter = "lambda", vals = "9.342e-02")
# ) # max.Accuracy.OOB = 0.5956175 @ 0.325 0.03345007
# glbMdlTuneParams <- rbind(glbMdlTuneParams
# ,cbind(data.frame(mdlId = "All.X##rcv#glmnet"), AllX__rcv_glmnetTuneParams)
# ,cbind(data.frame(mdlId = "Final.All.X##rcv#glmnet"), FinalAllX__rcv_glmnetTuneParams)
# )
#avNNet
# size=[1] 3 5 7 9; decay=[0] 1e-04 0.001 0.01 0.1; bag=[FALSE]; RMSE=1.3300906
#bagEarth
# degree=1 [2] 3; nprune=64 128 256 512 [1024]; RMSE=0.6486663 (up)
# bagEarthTuneParams <- rbind(data.frame()
# ,data.frame(parameter = "degree", vals = "1")
# ,data.frame(parameter = "nprune", vals = "256")
# )
#earth
# degree=[1]; nprune=2 [9] 17 25 33; RMSE=0.1334478
#gbm
# shrinkage=0.05 [0.10] 0.15 0.20 0.25; n.trees=100 150 200 [250] 300; interaction.depth=[1] 2 3 4 5; n.minobsinnode=[10]; RMSE=0.2008313
# glbMdlTuneParams <- myrbind_df(glbMdlTuneParams, rbind(data.frame()
# ,data.frame(method = "gbm", parameter = "shrinkage", min = 0.05, max = 0.25, by = 0.05)
# ,data.frame(method = "gbm", parameter = "n.trees", min = 100, max = 300, by = 50)
# ,data.frame(method = "gbm", parameter = "interaction.depth", min = 1, max = 5, by = 1)
# ,data.frame(method = "gbm", parameter = "n.minobsinnode", min = 10, max = 10, by = 10)
# #seq(from=0.05, to=0.25, by=0.05)
# ))
#glmnet
# alpha=0.100 [0.325] 0.550 0.775 1.000; lambda=0.0005232693 0.0024288010 0.0112734954 [0.0523269304] 0.2428800957; RMSE=0.6164891
# glbMdlTuneParams <- myrbind_df(glbMdlTuneParams, rbind(data.frame()
# ,data.frame(method = "glmnet", parameter = "alpha", vals = "0.550 0.775 0.8875 0.94375 1.000")
# ,data.frame(method = "glmnet", parameter = "lambda", vals = "9.858855e-05 0.0001971771 0.0009152152 0.0042480525 0.0197177130")
# ))
#nnet
# size=3 5 [7] 9 11; decay=0.0001 0.001 0.01 [0.1] 0.2; RMSE=0.9287422
# glbMdlTuneParams <- myrbind_df(glbMdlTuneParams, rbind(data.frame()
# ,data.frame(method = "nnet", parameter = "size", vals = "3 5 7 9 11")
# ,data.frame(method = "nnet", parameter = "decay", vals = "0.0001 0.0010 0.0100 0.1000 0.2000")
# ))
#rf # Don't bother; results are not deterministic
# mtry=2 35 68 [101] 134; RMSE=0.1339974
# glbMdlTuneParams <- myrbind_df(glbMdlTuneParams, rbind(data.frame()
# ,data.frame(method = "rf", parameter = "mtry", vals = "2 5 9 13 17")
# ))
#rpart
# cp=0.020 [0.025] 0.030 0.035 0.040; RMSE=0.1770237
# glbMdlTuneParams <- myrbind_df(glbMdlTuneParams, rbind(data.frame()
# ,data.frame(method = "rpart", parameter = "cp", vals = "0.004347826 0.008695652 0.017391304 0.021739130 0.034782609")
# ))
#svmLinear
# C=0.01 0.05 [0.10] 0.50 1.00 2.00 3.00 4.00; RMSE=0.1271318; 0.1296718
# glbMdlTuneParams <- myrbind_df(glbMdlTuneParams, rbind(data.frame()
# ,data.frame(method = "svmLinear", parameter = "C", vals = "0.01 0.05 0.1 0.5 1")
# ))
#svmLinear2
# cost=0.0625 0.1250 [0.25] 0.50 1.00; RMSE=0.1276354
# glbMdlTuneParams <- myrbind_df(glbMdlTuneParams, rbind(data.frame()
# ,data.frame(method = "svmLinear2", parameter = "cost", vals = "0.0625 0.125 0.25 0.5 1")
# ))
#svmPoly
# degree=[1] 2 3 4 5; scale=0.01 0.05 [0.1] 0.5 1; C=0.50 1.00 [2.00] 3.00 4.00; RMSE=0.1276130
# glbMdlTuneParams <- myrbind_df(glbMdlTuneParams, rbind(data.frame()
# ,data.frame(method="svmPoly", parameter="degree", min=1, max=5, by=1) #seq(1, 5, 1)
# ,data.frame(method="svmPoly", parameter="scale", vals="0.01, 0.05, 0.1, 0.5, 1")
# ,data.frame(method="svmPoly", parameter="C", vals="0.50, 1.00, 2.00, 3.00, 4.00")
# ))
#svmRadial
# sigma=[0.08674323]; C=0.25 0.50 1.00 [2.00] 4.00; RMSE=0.1614957
#glb2Sav(); all.equal(sav_models_df, glb_models_df)
pkgPreprocMethods <-
# caret version: 6.0.068 # packageVersion("caret")
# operations are applied in this order: zero-variance filter, near-zero variance filter, Box-Cox/Yeo-Johnson/exponential transformation, centering, scaling, range, imputation, PCA, ICA then spatial sign
# *Impute methods needed only if NAs are fed to myfit_mdl
# Also, ordered.factor in caret creates features as Edn.fctr^4 which is treated as an exponent by bagImpute
c(NULL
,"zv", "nzv"
,"BoxCox", "YeoJohnson", "expoTrans"
,"center", "scale", "center.scale", "range"
,"knnImpute", "bagImpute", "medianImpute"
,"zv.pca", "ica", "spatialSign"
,"conditionalX")
glbMdlPreprocMethods <- list(NULL # default
# ,"All.X" = list("glmnet" = union(setdiff(pkgPreprocMethods,
# c("knnImpute", "bagImpute", "medianImpute")),
# # c(NULL)))
# c("zv.YeoJohnson.pca")))
# ,"RFE.X" = list("glmnet" = union(setdiff(pkgPreprocMethods,
# c("knnImpute", "bagImpute", "medianImpute")),
# c(NULL)))
# # c("zv.pca.spatialSign")))
)
# glbMdlPreprocMethods[["RFE.X"]] <- list("glmnet" = union(unlist(glbMdlPreprocMethods[["All.X"]]),
# "nzv.pca.spatialSign"))
# Baseline prediction model feature(s)
glb_Baseline_mdl_var <- NULL # or c("<feat>")
glbMdlMetric_terms <- NULL # or matrix(c(
# 0,1,2,3,4,
# 2,0,1,2,3,
# 4,2,0,1,2,
# 6,4,2,0,1,
# 8,6,4,2,0
# ), byrow=TRUE, nrow=5)
glbMdlMetricSummary <- NULL # or "<metric_name>"
glbMdlMetricMaximize <- NULL # or FALSE (TRUE is not the default for both classification & regression)
glbMdlMetricSummaryFn <- NULL # or function(data, lev=NULL, model=NULL) {
# confusion_mtrx <- t(as.matrix(confusionMatrix(data$pred, data$obs)))
# #print(confusion_mtrx)
# #print(confusion_mtrx * glbMdlMetric_terms)
# metric <- sum(confusion_mtrx * glbMdlMetric_terms) / nrow(data)
# names(metric) <- glbMdlMetricSummary
# return(metric)
# }
glbMdlCheckRcv <- FALSE # Turn it on when needed; otherwise takes long time
glb_rcv_n_folds <- 7 # or NULL
glb_rcv_n_repeats <- 3 # or NULL
glb_clf_proba_threshold <- NULL # 0.5
# Model selection criteria
if (glb_is_regression)
glbMdlMetricsEval <- c("min.RMSE.OOB", "max.R.sq.OOB", "min.elapsedtime.everything",
"max.Adj.R.sq.fit", "min.RMSE.fit")
#glbMdlMetricsEval <- c("min.RMSE.fit", "max.R.sq.fit", "max.Adj.R.sq.fit")
if (glb_is_classification) {
if (glb_is_binomial)
glbMdlMetricsEval <-
c("max.Accuracy.OOB", "max.AUCROCR.OOB", "max.AUCpROC.OOB",
"min.elapsedtime.everything",
# "min.aic.fit",
"max.Accuracy.fit") else
glbMdlMetricsEval <- c("max.Accuracy.OOB", "max.Kappa.OOB", "min.elapsedtime.everything")
}
# select from NULL [no ensemble models], "auto" [all models better than MFO or Baseline], c(mdl_ids in glb_models_lst) [Typically top-rated models in auto]
glbMdlEnsemble <- NULL # default # options: "auto"
# "%<d-% setdiff(mygetEnsembleAutoMdlIds(), 'CSM.X.rf')"
# c(<comma-separated-mdlIds>
# )
glbMdlEnsembleSampleMethods <- c("boot", "boot632", "cv", "repeatedcv"
# , "LOOCV" # tuneLength * nrow(fitDF) # way too many models
, "LGOCV"
, "adaptive_cv"
, "adaptive_boot"
, "adaptive_LGOCV"
)
# Only for classifications; for regressions remove "(.*)\\.prob" form the regex
# tmp_fitobs_df <- glbObsFit[, grep(paste0("^", gsub(".", "\\.", mygetPredictIds$value, fixed = TRUE), "CSM\\.X\\.(.*)\\.prob"), names(glbObsFit), value = TRUE)]; cor_mtrx <- cor(tmp_fitobs_df); cor_vctr <- sort(cor_mtrx[row.names(orderBy(~-Overall, varImp(glb_models_lst[["Ensemble.repeatedcv.glmnet"]])$imp))[1], ]); summary(cor_vctr); cor_vctr
#ntv.glm <- glm(reformulate(indepVar, glb_rsp_var), family = "binomial", data = glbObsFit)
#step.glm <- step(ntv.glm)
glbMdlSltId <- NULL #select from c(NULL, "All.X##rcv#glmnet", "RFE.X##rcv#glmnet", <mdlId>)
glbMdlFnlId <- NULL #select from c(NULL, glbMdlSltId)
glbMdlFnlNslId <- NULL
glbMdlFnlRslId <- NULL
glb_dsp_cols <- c(".pos", glbFeatsId, glbFeatsCategory, glb_rsp_var
# List critical cols excl. above
)
# Output specs
# lclgetfltout_df <- function(obsOutDf) {
# require(tidyr)
# obsOutDf <- obsOutDf %>%
# tidyr::separate("ImageId.x.y", c(".src", ".pos", "x", "y"),
# sep = "#", remove = TRUE, extra = "merge")
# # mnm prefix stands for max_n_mean
# mnmout_df <- obsOutDf %>%
# dplyr::group_by(.pos) %>%
# #dplyr::top_n(1, Probability1) %>% # Score = 3.9426
# #dplyr::top_n(2, Probability1) %>% # Score = ???; weighted = 3.94254;
# #dplyr::top_n(3, Probability1) %>% # Score = 3.9418; weighted = 3.94169;
# dplyr::top_n(4, Probability1) %>% # Score = ???; weighted = 3.94149;
# #dplyr::top_n(5, Probability1) %>% # Score = 3.9421; weighted = 3.94178
#
# # dplyr::summarize(xMeanN = mean(as.numeric(x)), yMeanN = mean(as.numeric(y)))
# # dplyr::summarize(xMeanN = weighted.mean(as.numeric(x), Probability1), yMeanN = mean(as.numeric(y)))
# # dplyr::summarize(xMeanN = weighted.mean(as.numeric(x), c(Probability1, 0.2357323, 0.2336925)), yMeanN = mean(as.numeric(y)))
# # dplyr::summarize(xMeanN = weighted.mean(as.numeric(x), c(Probability1)), yMeanN = mean(as.numeric(y)))
# dplyr::summarize(xMeanN = weighted.mean(as.numeric(x), c(Probability1)),
# yMeanN = weighted.mean(as.numeric(y), c(Probability1)))
#
# maxout_df <- obsOutDf %>%
# dplyr::group_by(.pos) %>%
# dplyr::summarize(maxProb1 = max(Probability1))
# fltout_df <- merge(maxout_df, obsOutDf,
# by.x = c(".pos", "maxProb1"), by.y = c(".pos", "Probability1"),
# all.x = TRUE)
# fmnout_df <- merge(fltout_df, mnmout_df,
# by.x = c(".pos"), by.y = c(".pos"),
# all.x = TRUE)
# return(fmnout_df)
# }
glbObsOut <- list(NULL
# glbFeatsId will be the first output column, by default
,vars = list()
# ,mapFn = function(obsOutDf) {
# }
)
#obsOutFnlNslDf <- savobsOutFnlNslDf
# glbObsOut$mapFn <- function(obsOutDf) {
# txfout_df <- dplyr::select(obsOutDf, -.pos.y) %>%
# dplyr::mutate(
# lunch = levels(glbObsTrn[, "lunch" ])[
# round(mean(as.numeric(glbObsTrn[, "lunch" ])), 0)],
# dinner = levels(glbObsTrn[, "dinner" ])[
# round(mean(as.numeric(glbObsTrn[, "dinner" ])), 0)],
# reserve = levels(glbObsTrn[, "reserve" ])[
# round(mean(as.numeric(glbObsTrn[, "reserve" ])), 0)],
# outdoor = levels(glbObsTrn[, "outdoor" ])[
# round(mean(as.numeric(glbObsTrn[, "outdoor" ])), 0)],
# expensive = levels(glbObsTrn[, "expensive"])[
# round(mean(as.numeric(glbObsTrn[, "expensive"])), 0)],
# liquor = levels(glbObsTrn[, "liquor" ])[
# round(mean(as.numeric(glbObsTrn[, "liquor" ])), 0)],
# table = levels(glbObsTrn[, "table" ])[
# round(mean(as.numeric(glbObsTrn[, "table" ])), 0)],
# classy = levels(glbObsTrn[, "classy" ])[
# round(mean(as.numeric(glbObsTrn[, "classy" ])), 0)],
# kids = levels(glbObsTrn[, "kids" ])[
# round(mean(as.numeric(glbObsTrn[, "kids" ])), 0)]
# )
#
# print("ObsNew output class tables:")
# print(sapply(c("lunch","dinner","reserve","outdoor",
# "expensive","liquor","table",
# "classy","kids"),
# function(feat) table(txfout_df[, feat], useNA = "ifany")))
#
# txfout_df <- txfout_df %>%
# dplyr::mutate(labels = "") %>%
# dplyr::mutate(labels =
# ifelse(lunch != "-1", paste(labels, lunch ), labels)) %>%
# dplyr::mutate(labels =
# ifelse(dinner != "-1", paste(labels, dinner ), labels)) %>%
# dplyr::mutate(labels =
# ifelse(reserve != "-1", paste(labels, reserve ), labels)) %>%
# dplyr::mutate(labels =
# ifelse(outdoor != "-1", paste(labels, outdoor ), labels)) %>%
# dplyr::mutate(labels =
# ifelse(expensive != "-1", paste(labels, expensive), labels)) %>%
# dplyr::mutate(labels =
# ifelse(liquor != "-1", paste(labels, liquor ), labels)) %>%
# dplyr::mutate(labels =
# ifelse(table != "-1", paste(labels, table ), labels)) %>%
# dplyr::mutate(labels =
# ifelse(classy != "-1", paste(labels, classy ), labels)) %>%
# dplyr::mutate(labels =
# ifelse(kids != "-1", paste(labels, kids ), labels)) %>%
# dplyr::select(business_id, labels)
# return(txfout_df)
# }
#if (!is.null(glbObsOut$mapFn)) obsOutFnlNslDf <- glbObsOut$mapFn(obsOutFnlNslDf); print(head(obsOutFnlNslDf))
glb_out_obs <- NULL # select from c(NULL : default to "new", "all", "new", "trn")
if (glb_is_classification && glb_is_binomial) {
# glbObsOut$vars[["Probability1"]] <-
# "%<d-% glbObsNew[, mygetPredictIds(glb_rsp_var, glbMdlId)$prob]"
# glbObsOut$vars[[glb_rsp_var_raw]] <-
# "%<d-% glb_map_rsp_var_to_raw(glbObsNew[,
# mygetPredictIds(glb_rsp_var, glbMdlId)$value])"
glbObsOut$vars[["Predictions"]] <-
"%<d-% glb_map_rsp_var_to_raw(glbObsNew[,
mygetPredictIds(glb_rsp_var, glbMdlFnlNslId)$value])"
} else {
# glbObsOut$vars[[glbFeatsId]] <-
# "%<d-% as.integer(gsub('Test#', '', glbObsNew[, glbFeatsId]))"
glbObsOut$vars[[glb_rsp_var]] <-
"%<d-% glbObsNew[, mygetPredictIds(glb_rsp_var, glbMdlFnlNslId)$value]"
# for (outVar in setdiff(glbFeatsExcludeLcl, glb_rsp_var_raw))
# glbObsOut$vars[[outVar]] <-
# paste0("%<d-% mean(glbObsAll[, \"", outVar, "\"], na.rm = TRUE)")
}
glbOutStackFnames <- NULL #: default
# c("ebayipads_txt_assoc1_out_bid1_stack.csv")
glbOut <- list(pfx = "CFPB-FtrEng-rest-03_")
# lclImageSampleSeed <- 129
glbOutDataVizFname <- NULL # choose from c(NULL, "<projectId>_obsall.csv")
glbChunks <- list(labels = c("set_global_options_wd","set_global_options"
,"import.data","inspect.data","scrub.data","transform.data"
,"extract.features"
,"extract.features.datetime","extract.features.image","extract.features.price"
,"extract.features.text","extract.features.string"
,"extract.features.end"
,"manage.missing.data","cluster.data","partition.data.training","select.features"
,"fit.models_0","fit.models_1","fit.models_2","fit.models_3"
,"fit.data.training_0","fit.data.training_1"
,"predict.data.new"
,"display.session.info"))
# To ensure that all chunks in this script are in glbChunks
if (!is.null(chkChunksLabels <- knitr::all_labels()) && # knitr::all_labels() doesn't work in console runs
!identical(chkChunksLabels, glbChunks$labels)) {
print(sprintf("setdiff(chkChunksLabels, glbChunks$labels): %s",
setdiff(chkChunksLabels, glbChunks$labels)))
print(sprintf("setdiff(glbChunks$labels, chkChunksLabels): %s",
setdiff(glbChunks$labels, chkChunksLabels)))
}
glbChunks[["first"]] <-"select.features"; #NULL #default: script will load envir from previous chunk
glbChunks[["last"]] <- NULL #"extract.features.end" #NULL #default: script will save envir at end of this chunk
glbChunks[["inpFilePathName"]] <- "data/CFPB-FtrEng-partition-data-training-02_partition.data.training.RData"
#mysavChunk(glbOut$pfx, glbChunks[["last"]]) # called from myevlChunk
# Inspect max OOB FP
#chkObsOOB <- subset(glbObsOOB, !label.fctr.All.X..rcv.glmnet.is.acc)
#chkObsOOBFP <- subset(chkObsOOB, label.fctr.All.X..rcv.glmnet == "left_eye_center") %>% dplyr::mutate(Probability1 = label.fctr.All.X..rcv.glmnet.prob) %>% select(-.src, -.pos, -x, -y) %>% lclgetfltout_df() %>% mutate(obj.distance = (((as.numeric(x) - left_eye_center_x.int) ^ 2) + ((as.numeric(y) - left_eye_center_y.int) ^ 2)) ^ 0.5) %>% dplyr::top_n(5, obj.distance) %>% dplyr::top_n(5, -patch.cor)
#
#newImgObs <- glbObsNew[(glbObsNew$ImageId == "Test#0001"), ]; print(newImgObs[which.max(newImgObs$label.fctr.Trn..rcv.glmnet.prob), ])
#OOBImgObs <- glbObsOOB[(glbObsOOB$ImageId == "Train#0003"), ]; print(OOBImgObs[which.max(OOBImgObs$label.fctr.All.X..rcv.glmnet.prob), ])
#mygetImage(which(glbObsAll[, glbFeatsId] == "Train#0003"), names(glbFeatsImage)[1], plot = TRUE, featHighlight = c("left_eye_center_x", "left_eye_center_y"), ovrlHighlight = c(66, 35))
# Depict process
glb_analytics_pn <- petrinet(name = "glb_analytics_pn",
trans_df = data.frame(id = 1:6,
name = c("data.training.all","data.new",
"model.selected","model.final",
"data.training.all.prediction","data.new.prediction"),
x=c( -5,-5,-15,-25,-25,-35),
y=c( -5, 5, 0, 0, -5, 5)
),
places_df=data.frame(id=1:4,
name=c("bgn","fit.data.training.all","predict.data.new","end"),
x=c( -0, -20, -30, -40),
y=c( 0, 0, 0, 0),
M0=c( 3, 0, 0, 0)
),
arcs_df = data.frame(
begin = c("bgn","bgn","bgn",
"data.training.all","model.selected","fit.data.training.all",
"fit.data.training.all","model.final",
"data.new","predict.data.new",
"data.training.all.prediction","data.new.prediction"),
end = c("data.training.all","data.new","model.selected",
"fit.data.training.all","fit.data.training.all","model.final",
"data.training.all.prediction","predict.data.new",
"predict.data.new","data.new.prediction",
"end","end")
))
#print(ggplot.petrinet(glb_analytics_pn))
print(ggplot.petrinet(glb_analytics_pn) + coord_flip())
## Loading required package: grid
glb_analytics_avl_objs <- NULL
glb_chunks_df <- myadd_chunk(NULL,
ifelse(is.null(glbChunks$first), "import.data", glbChunks$first))
## label step_major step_minor label_minor bgn end elapsed
## 1 select.features 1 0 0 6.502 NA NA
1.0: select features1.0: select features1.0: select features1.0: select features1.0: select features1.0: select features1.0: select features1.0: select features1.0: select features1.0: select features1.0: select features1.0: select features1.0: select features1.0: select features1.0: select features## Warning in as.POSIXlt.POSIXct(x, tz): unknown timezone 'NA'
## Warning in as.POSIXlt.POSIXct(x, tz): unknown timezone 'NA'
## Warning in cor(data.matrix(entity_df[, sel_feats]), y =
## as.numeric(entity_df[, : the standard deviation is zero
## Loading required package: reshape2
## Warning in as.POSIXlt.POSIXct(x, tz): unknown timezone 'NA'
## Warning in as.POSIXlt.POSIXct(x, tz): unknown timezone 'NA'
## Warning in as.POSIXlt.POSIXct(x, tz): unknown timezone 'NA'
## Warning in as.POSIXlt.POSIXct(x, tz): unknown timezone 'NA'
## Warning in as.POSIXlt.POSIXct(x, tz): unknown timezone 'NA'
## Warning in as.POSIXlt.POSIXct(x, tz): unknown timezone 'NA'
## Warning in as.POSIXlt.POSIXct(x, tz): unknown timezone 'NA'
## Warning in as.POSIXlt.POSIXct(x, tz): unknown timezone 'NA'
## [1] "cor(Datereceived.date.fctr, Datesenttocompany.date.fctr)=1.0000"
## [1] "cor(CDisputed.fctr, Datereceived.date.fctr)=0.0091"
## [1] "cor(CDisputed.fctr, Datesenttocompany.date.fctr)=0.0091"
## Warning in myfind_cor_features(feats_df = glb_feats_df, obs_df =
## glbObsTrn, : Identified Datesenttocompany.date.fctr as highly correlated
## with Datereceived.date.fctr
## [1] "cor(Datereceived.juliandate, Datesenttocompany.juliandate)=1.0000"
## [1] "cor(CDisputed.fctr, Datereceived.juliandate)=0.0293"
## [1] "cor(CDisputed.fctr, Datesenttocompany.juliandate)=0.0293"
## Warning in myfind_cor_features(feats_df = glb_feats_df, obs_df =
## glbObsTrn, : Identified Datesenttocompany.juliandate as highly correlated
## with Datereceived.juliandate
## [1] "cor(Datereceived.last16.log1p, Datesenttocompany.last16.log1p)=1.0000"
## [1] "cor(CDisputed.fctr, Datereceived.last16.log1p)=0.0789"
## [1] "cor(CDisputed.fctr, Datesenttocompany.last16.log1p)=0.0789"
## Warning in myfind_cor_features(feats_df = glb_feats_df, obs_df =
## glbObsTrn, : Identified Datesenttocompany.last16.log1p as highly correlated
## with Datereceived.last16.log1p
## [1] "cor(Datereceived.last2.log1p, Datesenttocompany.last2.log1p)=1.0000"
## [1] "cor(CDisputed.fctr, Datereceived.last2.log1p)=0.0431"
## [1] "cor(CDisputed.fctr, Datesenttocompany.last2.log1p)=0.0431"
## Warning in myfind_cor_features(feats_df = glb_feats_df, obs_df =
## glbObsTrn, : Identified Datesenttocompany.last2.log1p as highly correlated
## with Datereceived.last2.log1p
## [1] "cor(Datereceived.last32.log1p, Datesenttocompany.last32.log1p)=1.0000"
## [1] "cor(CDisputed.fctr, Datereceived.last32.log1p)=0.0547"
## [1] "cor(CDisputed.fctr, Datesenttocompany.last32.log1p)=0.0547"
## Warning in myfind_cor_features(feats_df = glb_feats_df, obs_df =
## glbObsTrn, : Identified Datesenttocompany.last32.log1p as highly correlated
## with Datereceived.last32.log1p
## [1] "cor(Datereceived.last4.log1p, Datesenttocompany.last4.log1p)=1.0000"
## [1] "cor(CDisputed.fctr, Datereceived.last4.log1p)=0.0579"
## [1] "cor(CDisputed.fctr, Datesenttocompany.last4.log1p)=0.0579"
## Warning in myfind_cor_features(feats_df = glb_feats_df, obs_df =
## glbObsTrn, : Identified Datesenttocompany.last4.log1p as highly correlated
## with Datereceived.last4.log1p
## [1] "cor(Datereceived.last8.log1p, Datesenttocompany.last8.log1p)=1.0000"
## [1] "cor(CDisputed.fctr, Datereceived.last8.log1p)=0.0785"
## [1] "cor(CDisputed.fctr, Datesenttocompany.last8.log1p)=0.0785"
## Warning in myfind_cor_features(feats_df = glb_feats_df, obs_df =
## glbObsTrn, : Identified Datesenttocompany.last8.log1p as highly correlated
## with Datereceived.last8.log1p
## [1] "cor(Datereceived.month.fctr, Datesenttocompany.month.fctr)=1.0000"
## [1] "cor(CDisputed.fctr, Datereceived.month.fctr)=0.0286"
## [1] "cor(CDisputed.fctr, Datesenttocompany.month.fctr)=0.0286"
## Warning in myfind_cor_features(feats_df = glb_feats_df, obs_df =
## glbObsTrn, : Identified Datesenttocompany.month.fctr as highly correlated
## with Datereceived.month.fctr
## [1] "cor(Datereceived.wkday.fctr, Datesenttocompany.wkday.fctr)=1.0000"
## [1] "cor(CDisputed.fctr, Datereceived.wkday.fctr)=-0.0082"
## [1] "cor(CDisputed.fctr, Datesenttocompany.wkday.fctr)=-0.0082"
## Warning in myfind_cor_features(feats_df = glb_feats_df, obs_df =
## glbObsTrn, : Identified Datesenttocompany.wkday.fctr as highly correlated
## with Datereceived.wkday.fctr
## [1] "cor(Datereceived.wkend, Datesenttocompany.wkend)=1.0000"
## [1] "cor(CDisputed.fctr, Datereceived.wkend)=0.0182"
## [1] "cor(CDisputed.fctr, Datesenttocompany.wkend)=0.0182"
## Warning in myfind_cor_features(feats_df = glb_feats_df, obs_df =
## glbObsTrn, : Identified Datesenttocompany.wkend as highly correlated with
## Datereceived.wkend
## [1] "cor(Datereceived.year.fctr, Datesenttocompany.year.fctr)=1.0000"
## [1] "cor(CDisputed.fctr, Datereceived.year.fctr)=0.0315"
## [1] "cor(CDisputed.fctr, Datesenttocompany.year.fctr)=0.0315"
## Warning in myfind_cor_features(feats_df = glb_feats_df, obs_df =
## glbObsTrn, : Identified Datesenttocompany.year.fctr as highly correlated
## with Datereceived.year.fctr
## [1] "cor(Datereceived.juliandate, Datereceived.month.fctr)=0.9962"
## [1] "cor(CDisputed.fctr, Datereceived.juliandate)=0.0293"
## [1] "cor(CDisputed.fctr, Datereceived.month.fctr)=0.0286"
## Warning in myfind_cor_features(feats_df = glb_feats_df, obs_df =
## glbObsTrn, : Identified Datereceived.month.fctr as highly correlated with
## Datereceived.juliandate
## [1] "cor(ComplaintID, Datereceived.year.fctr)=0.9755"
## [1] "cor(CDisputed.fctr, ComplaintID)=0.0416"
## [1] "cor(CDisputed.fctr, Datereceived.year.fctr)=0.0315"
## Warning in myfind_cor_features(feats_df = glb_feats_df, obs_df =
## glbObsTrn, : Identified Datereceived.year.fctr as highly correlated with
## ComplaintID
## [1] "cor(Sent.Recd.Dys.log1p, Sent.Recd.Dys.root2)=0.9490"
## [1] "cor(CDisputed.fctr, Sent.Recd.Dys.log1p)=-0.0195"
## [1] "cor(CDisputed.fctr, Sent.Recd.Dys.root2)=-0.0190"
## Warning in myfind_cor_features(feats_df = glb_feats_df, obs_df =
## glbObsTrn, : Identified Sent.Recd.Dys.root2 as highly correlated with
## Sent.Recd.Dys.log1p
## [1] "cor(Sent.Recd.Dys.log1p, Sent.Recd.Dys.nexp)=-0.7997"
## [1] "cor(CDisputed.fctr, Sent.Recd.Dys.log1p)=-0.0195"
## [1] "cor(CDisputed.fctr, Sent.Recd.Dys.nexp)=0.0145"
## Warning in myfind_cor_features(feats_df = glb_feats_df, obs_df =
## glbObsTrn, : Identified Sent.Recd.Dys.nexp as highly correlated with
## Sent.Recd.Dys.log1p
## cor.y exclude.as.feat cor.y.abs
## Datereceived.last16.log1p 0.0788694683 0 0.0788694683
## Datesenttocompany.last16.log1p 0.0788694683 0 0.0788694683
## Datereceived.last8.log1p 0.0785261900 0 0.0785261900
## Datesenttocompany.last8.log1p 0.0785261900 0 0.0785261900
## Datereceived.last4.log1p 0.0578731600 0 0.0578731600
## Datesenttocompany.last4.log1p 0.0578731600 0 0.0578731600
## Datereceived.last32.log1p 0.0546615200 0 0.0546615200
## Datesenttocompany.last32.log1p 0.0546615200 0 0.0546615200
## Datereceived.last2.log1p 0.0430861964 0 0.0430861964
## Datesenttocompany.last2.log1p 0.0430861964 0 0.0430861964
## ComplaintID 0.0415642342 0 0.0415642342
## spcPrd.fctr 0.0409122512 0 0.0409122512
## Datereceived.POSIX 0.0389719499 1 0.0389719499
## Datesenttocompany.POSIX 0.0389719499 1 0.0389719499
## gCConsent.fctr 0.0345921628 0 0.0345921628
## Datereceived.year.fctr 0.0315331012 0 0.0315331012
## Datesenttocompany.year.fctr 0.0315331012 0 0.0315331012
## Datereceived.juliandate 0.0292889018 0 0.0292889018
## Datesenttocompany.juliandate 0.0292889018 0 0.0292889018
## Datereceived.month.fctr 0.0286043882 0 0.0286043882
## Datesenttocompany.month.fctr 0.0286043882 0 0.0286043882
## .pos 0.0195213317 0 0.0195213317
## Datereceived.zoo 0.0183915709 1 0.0183915709
## Datesenttocompany.zoo 0.0183915709 1 0.0183915709
## Datereceived.wkend 0.0181678194 0 0.0181678194
## Datesenttocompany.wkend 0.0181678194 0 0.0181678194
## Sent.Recd.Dys.nexp 0.0145148848 0 0.0145148848
## Datereceived.date.fctr 0.0091129069 0 0.0091129069
## Datesenttocompany.date.fctr 0.0091129069 0 0.0091129069
## .rnorm 0.0072224455 0 0.0072224455
## Rgn.Dvn.fctr 0.0031221830 0 0.0031221830
## Timelyresponse.fctr -0.0009823835 0 0.0009823835
## gTags.fctr -0.0023288417 0 0.0023288417
## Datereceived.wkday.fctr -0.0081792652 0 0.0081792652
## Datesenttocompany.wkday.fctr -0.0081792652 0 0.0081792652
## Sent.Recd.Dys -0.0155591913 0 0.0155591913
## Sent.Recd.Dys.root2 -0.0190489018 0 0.0190489018
## Sent.Recd.Dys.log1p -0.0194971993 0 0.0194971993
## Channel.fctr -0.0332581054 0 0.0332581054
## Response.fctr -0.0767900572 0 0.0767900572
## .category NA 1 NA
## Datereceived.day.minutes NA 1 NA
## Datereceived.hlday NA 0 NA
## Datereceived.hour.fctr NA 0 NA
## Datereceived.minute.fctr NA 0 NA
## Datereceived.second.fctr NA 0 NA
## Datesenttocompany.day.minutes NA 1 NA
## Datesenttocompany.hlday NA 0 NA
## Datesenttocompany.hour.fctr NA 0 NA
## Datesenttocompany.minute.fctr NA 0 NA
## Datesenttocompany.second.fctr NA 0 NA
## cor.high.X freqRatio
## Datereceived.last16.log1p <NA> 1.018160
## Datesenttocompany.last16.log1p Datereceived.last16.log1p 1.018160
## Datereceived.last8.log1p <NA> 2.536909
## Datesenttocompany.last8.log1p Datereceived.last8.log1p 2.536909
## Datereceived.last4.log1p <NA> 5.615241
## Datesenttocompany.last4.log1p Datereceived.last4.log1p 5.615241
## Datereceived.last32.log1p <NA> 2.963513
## Datesenttocompany.last32.log1p Datereceived.last32.log1p 2.963513
## Datereceived.last2.log1p <NA> 11.929177
## Datesenttocompany.last2.log1p Datereceived.last2.log1p 11.929177
## ComplaintID <NA> 1.000000
## spcPrd.fctr <NA> 1.246899
## Datereceived.POSIX <NA> 2.289062
## Datesenttocompany.POSIX <NA> 2.289062
## gCConsent.fctr <NA> 2.810347
## Datereceived.year.fctr ComplaintID 1.259131
## Datesenttocompany.year.fctr Datereceived.year.fctr 1.259131
## Datereceived.juliandate <NA> 1.949541
## Datesenttocompany.juliandate Datereceived.juliandate 1.949541
## Datereceived.month.fctr Datereceived.juliandate 1.059162
## Datesenttocompany.month.fctr Datereceived.month.fctr 1.059162
## .pos <NA> 1.000000
## Datereceived.zoo <NA> 2.294574
## Datesenttocompany.zoo <NA> 2.294574
## Datereceived.wkend <NA> 8.037852
## Datesenttocompany.wkend Datereceived.wkend 8.037852
## Sent.Recd.Dys.nexp Sent.Recd.Dys.log1p 3.331222
## Datereceived.date.fctr <NA> 1.100034
## Datesenttocompany.date.fctr Datereceived.date.fctr 1.100034
## .rnorm <NA> 1.000000
## Rgn.Dvn.fctr <NA> 1.609449
## Timelyresponse.fctr <NA> 63.659607
## gTags.fctr <NA> 14.351767
## Datereceived.wkday.fctr <NA> 1.013234
## Datesenttocompany.wkday.fctr Datereceived.wkday.fctr 1.013234
## Sent.Recd.Dys <NA> 3.331222
## Sent.Recd.Dys.root2 Sent.Recd.Dys.log1p 3.331222
## Sent.Recd.Dys.log1p <NA> 3.331222
## Channel.fctr <NA> 8.353941
## Response.fctr <NA> 9.220592
## .category <NA> 0.000000
## Datereceived.day.minutes <NA> 0.000000
## Datereceived.hlday <NA> 0.000000
## Datereceived.hour.fctr <NA> 0.000000
## Datereceived.minute.fctr <NA> 0.000000
## Datereceived.second.fctr <NA> 0.000000
## Datesenttocompany.day.minutes <NA> 0.000000
## Datesenttocompany.hlday <NA> 0.000000
## Datesenttocompany.hour.fctr <NA> 0.000000
## Datesenttocompany.minute.fctr <NA> 0.000000
## Datesenttocompany.second.fctr <NA> 0.000000
## percentUnique zeroVar nzv
## Datereceived.last16.log1p 1.403837e-02 FALSE FALSE
## Datesenttocompany.last16.log1p 1.403837e-02 FALSE FALSE
## Datereceived.last8.log1p 1.169864e-02 FALSE FALSE
## Datesenttocompany.last8.log1p 1.169864e-02 FALSE FALSE
## Datereceived.last4.log1p 9.358914e-03 FALSE FALSE
## Datesenttocompany.last4.log1p 9.358914e-03 FALSE FALSE
## Datereceived.last32.log1p 1.637810e-02 FALSE FALSE
## Datesenttocompany.last32.log1p 1.637810e-02 FALSE FALSE
## Datereceived.last2.log1p 9.358914e-03 FALSE FALSE
## Datesenttocompany.last2.log1p 9.358914e-03 FALSE FALSE
## ComplaintID 1.000000e+02 FALSE FALSE
## spcPrd.fctr 1.871783e-02 FALSE FALSE
## Datereceived.POSIX 3.907347e+00 FALSE FALSE
## Datesenttocompany.POSIX 3.907347e+00 FALSE FALSE
## gCConsent.fctr 9.358914e-03 FALSE FALSE
## Datereceived.year.fctr 1.169864e-02 FALSE FALSE
## Datesenttocompany.year.fctr 1.169864e-02 FALSE FALSE
## Datereceived.juliandate 8.563407e-01 FALSE FALSE
## Datesenttocompany.juliandate 8.563407e-01 FALSE FALSE
## Datereceived.month.fctr 2.807674e-02 FALSE FALSE
## Datesenttocompany.month.fctr 2.807674e-02 FALSE FALSE
## .pos 1.000000e+02 FALSE FALSE
## Datereceived.zoo 3.752925e+00 FALSE FALSE
## Datesenttocompany.zoo 3.752925e+00 FALSE FALSE
## Datereceived.wkend 4.679457e-03 FALSE FALSE
## Datesenttocompany.wkend 4.679457e-03 FALSE FALSE
## Sent.Recd.Dys.nexp 7.650912e-01 FALSE FALSE
## Datereceived.date.fctr 1.169864e-02 FALSE FALSE
## Datesenttocompany.date.fctr 1.169864e-02 FALSE FALSE
## .rnorm 9.987365e+01 FALSE FALSE
## Rgn.Dvn.fctr 2.339729e-02 FALSE FALSE
## Timelyresponse.fctr 4.679457e-03 FALSE TRUE
## gTags.fctr 9.358914e-03 FALSE FALSE
## Datereceived.wkday.fctr 1.637810e-02 FALSE FALSE
## Datesenttocompany.wkday.fctr 1.637810e-02 FALSE FALSE
## Sent.Recd.Dys 7.650912e-01 FALSE FALSE
## Sent.Recd.Dys.root2 7.650912e-01 FALSE FALSE
## Sent.Recd.Dys.log1p 7.650912e-01 FALSE FALSE
## Channel.fctr 1.403837e-02 FALSE FALSE
## Response.fctr 1.403837e-02 FALSE FALSE
## .category 2.339729e-03 TRUE TRUE
## Datereceived.day.minutes 2.339729e-03 TRUE TRUE
## Datereceived.hlday 2.339729e-03 TRUE TRUE
## Datereceived.hour.fctr 2.339729e-03 TRUE TRUE
## Datereceived.minute.fctr 2.339729e-03 TRUE TRUE
## Datereceived.second.fctr 2.339729e-03 TRUE TRUE
## Datesenttocompany.day.minutes 2.339729e-03 TRUE TRUE
## Datesenttocompany.hlday 2.339729e-03 TRUE TRUE
## Datesenttocompany.hour.fctr 2.339729e-03 TRUE TRUE
## Datesenttocompany.minute.fctr 2.339729e-03 TRUE TRUE
## Datesenttocompany.second.fctr 2.339729e-03 TRUE TRUE
## is.cor.y.abs.low
## Datereceived.last16.log1p FALSE
## Datesenttocompany.last16.log1p FALSE
## Datereceived.last8.log1p FALSE
## Datesenttocompany.last8.log1p FALSE
## Datereceived.last4.log1p FALSE
## Datesenttocompany.last4.log1p FALSE
## Datereceived.last32.log1p FALSE
## Datesenttocompany.last32.log1p FALSE
## Datereceived.last2.log1p FALSE
## Datesenttocompany.last2.log1p FALSE
## ComplaintID FALSE
## spcPrd.fctr FALSE
## Datereceived.POSIX FALSE
## Datesenttocompany.POSIX FALSE
## gCConsent.fctr FALSE
## Datereceived.year.fctr FALSE
## Datesenttocompany.year.fctr FALSE
## Datereceived.juliandate FALSE
## Datesenttocompany.juliandate FALSE
## Datereceived.month.fctr FALSE
## Datesenttocompany.month.fctr FALSE
## .pos FALSE
## Datereceived.zoo FALSE
## Datesenttocompany.zoo FALSE
## Datereceived.wkend FALSE
## Datesenttocompany.wkend FALSE
## Sent.Recd.Dys.nexp FALSE
## Datereceived.date.fctr FALSE
## Datesenttocompany.date.fctr FALSE
## .rnorm FALSE
## Rgn.Dvn.fctr TRUE
## Timelyresponse.fctr TRUE
## gTags.fctr TRUE
## Datereceived.wkday.fctr FALSE
## Datesenttocompany.wkday.fctr FALSE
## Sent.Recd.Dys FALSE
## Sent.Recd.Dys.root2 FALSE
## Sent.Recd.Dys.log1p FALSE
## Channel.fctr FALSE
## Response.fctr FALSE
## .category NA
## Datereceived.day.minutes NA
## Datereceived.hlday NA
## Datereceived.hour.fctr NA
## Datereceived.minute.fctr NA
## Datereceived.second.fctr NA
## Datesenttocompany.day.minutes NA
## Datesenttocompany.hlday NA
## Datesenttocompany.hour.fctr NA
## Datesenttocompany.minute.fctr NA
## Datesenttocompany.second.fctr NA
## Warning in myplot_scatter(plt_feats_df, "percentUnique", "freqRatio",
## colorcol_name = "nzv", : converting nzv to class:factor
## Warning: Removed 3 rows containing missing values (geom_point).
## Warning: Removed 3 rows containing missing values (geom_point).
## Warning: Removed 3 rows containing missing values (geom_point).
## cor.y exclude.as.feat cor.y.abs
## Timelyresponse.fctr -0.0009823835 0 0.0009823835
## .category NA 1 NA
## Datereceived.day.minutes NA 1 NA
## Datereceived.hlday NA 0 NA
## Datereceived.hour.fctr NA 0 NA
## Datereceived.minute.fctr NA 0 NA
## Datereceived.second.fctr NA 0 NA
## Datesenttocompany.day.minutes NA 1 NA
## Datesenttocompany.hlday NA 0 NA
## Datesenttocompany.hour.fctr NA 0 NA
## Datesenttocompany.minute.fctr NA 0 NA
## Datesenttocompany.second.fctr NA 0 NA
## cor.high.X freqRatio percentUnique zeroVar
## Timelyresponse.fctr <NA> 63.65961 0.004679457 FALSE
## .category <NA> 0.00000 0.002339729 TRUE
## Datereceived.day.minutes <NA> 0.00000 0.002339729 TRUE
## Datereceived.hlday <NA> 0.00000 0.002339729 TRUE
## Datereceived.hour.fctr <NA> 0.00000 0.002339729 TRUE
## Datereceived.minute.fctr <NA> 0.00000 0.002339729 TRUE
## Datereceived.second.fctr <NA> 0.00000 0.002339729 TRUE
## Datesenttocompany.day.minutes <NA> 0.00000 0.002339729 TRUE
## Datesenttocompany.hlday <NA> 0.00000 0.002339729 TRUE
## Datesenttocompany.hour.fctr <NA> 0.00000 0.002339729 TRUE
## Datesenttocompany.minute.fctr <NA> 0.00000 0.002339729 TRUE
## Datesenttocompany.second.fctr <NA> 0.00000 0.002339729 TRUE
## nzv is.cor.y.abs.low
## Timelyresponse.fctr TRUE TRUE
## .category TRUE NA
## Datereceived.day.minutes TRUE NA
## Datereceived.hlday TRUE NA
## Datereceived.hour.fctr TRUE NA
## Datereceived.minute.fctr TRUE NA
## Datereceived.second.fctr TRUE NA
## Datesenttocompany.day.minutes TRUE NA
## Datesenttocompany.hlday TRUE NA
## Datesenttocompany.hour.fctr TRUE NA
## Datesenttocompany.minute.fctr TRUE NA
## Datesenttocompany.second.fctr TRUE NA
## [1] "numeric data missing in : "
## CDisputed.fctr
## 3229
## [1] "numeric data w/ 0s in : "
## Sent.Recd.Dys Sent.Recd.Dys.log1p
## 19992 363
## Sent.Recd.Dys.root2 Datereceived.wkday.fctr
## 363 2152
## Datereceived.wkend Datereceived.hlday
## 40862 45969
## Datereceived.hour.fctr Datereceived.minute.fctr
## 45969 45969
## Datereceived.second.fctr Datereceived.day.minutes
## 45969 45969
## Datesenttocompany.wkday.fctr Datesenttocompany.wkend
## 2152 40862
## Datesenttocompany.hlday Datesenttocompany.hour.fctr
## 45969 45969
## Datesenttocompany.minute.fctr Datesenttocompany.second.fctr
## 45969 45969
## Datesenttocompany.day.minutes Datereceived.last2.log1p
## 45969 42624
## Datereceived.last4.log1p Datereceived.last8.log1p
## 39359 33248
## Datereceived.last16.log1p Datereceived.last32.log1p
## 22748 9516
## Datesenttocompany.last2.log1p Datesenttocompany.last4.log1p
## 42624 39359
## Datesenttocompany.last8.log1p Datesenttocompany.last16.log1p
## 33248 22748
## Datesenttocompany.last32.log1p
## 9516
## [1] "numeric data w/ Infs in : "
## named integer(0)
## [1] "numeric data w/ NaNs in : "
## named integer(0)
## [1] "string data missing in : "
## Datereceived Product
## 0 0
## Subproduct Issue
## 0 0
## Subissue Consumercomplaintnarrative
## 25343 35711
## Companypublicresponse Company
## 38356 0
## State ZIPcode
## 296 297
## Tags Consumerconsentprovided
## 40181 1563
## Submittedvia Datesenttocompany
## 0 0
## Companyresponsetoconsumer Timelyresponse
## 0 0
## Consumerdisputed spcPrd
## 3229 0
## Rgn.Dvn gTags
## 0 0
## gCConsent Channel
## 0 0
## Response .lcn
## 0 3229
## [1] "glb_feats_df:"
## [1] 51 12
## id exclude.as.feat rsp_var
## CDisputed.fctr CDisputed.fctr TRUE TRUE
## id cor.y exclude.as.feat cor.y.abs
## ComplaintID ComplaintID 0.04156423 FALSE 0.04156423
## CDisputed.fctr CDisputed.fctr NA TRUE NA
## cor.high.X freqRatio percentUnique zeroVar nzv
## ComplaintID <NA> 1 100 FALSE FALSE
## CDisputed.fctr <NA> NA NA NA NA
## is.cor.y.abs.low interaction.feat shapiro.test.p.value
## ComplaintID FALSE NA 4.52805e-40
## CDisputed.fctr NA NA NA
## rsp_var_raw id_var rsp_var
## ComplaintID FALSE TRUE NA
## CDisputed.fctr NA NA TRUE
## [1] "glb_feats_df vs. glbObsAll: "
## character(0)
## [1] "glbObsAll vs. glb_feats_df: "
## character(0)
## label step_major step_minor label_minor bgn end elapsed
## 1 select.features 1 0 0 6.502 22.459 15.957
## 2 fit.models 2 0 0 22.460 NA NA
2.0: fit modelsfit.models_0_chunk_df <- myadd_chunk(NULL, "fit.models_0_bgn", label.minor = "setup")
## label step_major step_minor label_minor bgn end elapsed
## 1 fit.models_0_bgn 1 0 setup 22.794 NA NA
# load(paste0(glbOut$pfx, "dsk.RData"))
glbgetModelSelectFormula <- function() {
model_evl_terms <- c(NULL)
# min.aic.fit might not be avl
lclMdlEvlCriteria <-
glbMdlMetricsEval[glbMdlMetricsEval %in% names(glb_models_df)]
for (metric in lclMdlEvlCriteria)
model_evl_terms <- c(model_evl_terms,
ifelse(length(grep("max", metric)) > 0, "-", "+"), metric)
if (glb_is_classification && glb_is_binomial)
model_evl_terms <- c(model_evl_terms, "-", "opt.prob.threshold.OOB")
model_sel_frmla <- as.formula(paste(c("~ ", model_evl_terms), collapse = " "))
return(model_sel_frmla)
}
glbgetDisplayModelsDf <- function() {
dsp_models_cols <- c("id",
glbMdlMetricsEval[glbMdlMetricsEval %in% names(glb_models_df)],
grep("opt.", names(glb_models_df), fixed = TRUE, value = TRUE))
dsp_models_df <-
#orderBy(glbgetModelSelectFormula(), glb_models_df)[, c("id", glbMdlMetricsEval)]
orderBy(glbgetModelSelectFormula(), glb_models_df)[, dsp_models_cols]
nCvMdl <- sapply(glb_models_lst, function(mdl) nrow(mdl$results))
nParams <- sapply(glb_models_lst, function(mdl) ifelse(mdl$method == "custom", 0,
nrow(subset(modelLookup(mdl$method), parameter != "parameter"))))
# nCvMdl <- nCvMdl[names(nCvMdl) != "avNNet"]
# nParams <- nParams[names(nParams) != "avNNet"]
if (length(cvMdlProblems <- nCvMdl[nCvMdl <= nParams]) > 0) {
print("Cross Validation issues:")
warning("Cross Validation issues:")
print(cvMdlProblems)
}
pltMdls <- setdiff(names(nCvMdl), names(cvMdlProblems))
pltMdls <- setdiff(pltMdls, names(nParams[nParams == 0]))
# length(pltMdls) == 21
png(paste0(glbOut$pfx, "bestTune.png"), width = 480 * 2, height = 480 * 4)
grid.newpage()
pushViewport(viewport(layout = grid.layout(ceiling(length(pltMdls) / 2.0), 2)))
pltIx <- 1
for (mdlId in pltMdls) {
print(ggplot(glb_models_lst[[mdlId]], highBestTune = TRUE) + labs(title = mdlId),
vp = viewport(layout.pos.row = ceiling(pltIx / 2.0),
layout.pos.col = ((pltIx - 1) %% 2) + 1))
pltIx <- pltIx + 1
}
dev.off()
if (all(row.names(dsp_models_df) != dsp_models_df$id))
row.names(dsp_models_df) <- dsp_models_df$id
return(dsp_models_df)
}
#glbgetDisplayModelsDf()
glb_get_predictions <- function(df, mdl_id, rsp_var, prob_threshold_def=NULL, verbose=FALSE) {
mdl <- glb_models_lst[[mdl_id]]
clmnNames <- mygetPredictIds(rsp_var, mdl_id)
predct_var_name <- clmnNames$value
predct_prob_var_name <- clmnNames$prob
predct_accurate_var_name <- clmnNames$is.acc
predct_error_var_name <- clmnNames$err
predct_erabs_var_name <- clmnNames$err.abs
if (glb_is_regression) {
df[, predct_var_name] <- predict(mdl, newdata=df, type="raw")
if (verbose) print(myplot_scatter(df, glb_rsp_var, predct_var_name) +
facet_wrap(reformulate(glbFeatsCategory), scales = "free") +
stat_smooth(method="glm"))
df[, predct_error_var_name] <- df[, predct_var_name] - df[, glb_rsp_var]
if (verbose) print(myplot_scatter(df, predct_var_name, predct_error_var_name) +
#facet_wrap(reformulate(glbFeatsCategory), scales = "free") +
stat_smooth(method="auto"))
if (verbose) print(myplot_scatter(df, glb_rsp_var, predct_error_var_name) +
#facet_wrap(reformulate(glbFeatsCategory), scales = "free") +
stat_smooth(method="glm"))
df[, predct_erabs_var_name] <- abs(df[, predct_error_var_name])
if (verbose) print(head(orderBy(reformulate(c("-", predct_erabs_var_name)), df)))
df[, predct_accurate_var_name] <- (df[, glb_rsp_var] == df[, predct_var_name])
}
if (glb_is_classification && glb_is_binomial) {
prob_threshold <- glb_models_df[glb_models_df$id == mdl_id,
"opt.prob.threshold.OOB"]
if (is.null(prob_threshold) || is.na(prob_threshold)) {
warning("Using default probability threshold: ", prob_threshold_def)
if (is.null(prob_threshold <- prob_threshold_def))
stop("Default probability threshold is NULL")
}
df[, predct_prob_var_name] <- predict(mdl, newdata = df, type = "prob")[, 2]
df[, predct_var_name] <-
factor(levels(df[, glb_rsp_var])[
(df[, predct_prob_var_name] >=
prob_threshold) * 1 + 1], levels(df[, glb_rsp_var]))
# if (verbose) print(myplot_scatter(df, glb_rsp_var, predct_var_name) +
# facet_wrap(reformulate(glbFeatsCategory), scales = "free") +
# stat_smooth(method="glm"))
df[, predct_error_var_name] <- df[, predct_var_name] != df[, glb_rsp_var]
# if (verbose) print(myplot_scatter(df, predct_var_name, predct_error_var_name) +
# #facet_wrap(reformulate(glbFeatsCategory), scales = "free") +
# stat_smooth(method="auto"))
# if (verbose) print(myplot_scatter(df, glb_rsp_var, predct_error_var_name) +
# #facet_wrap(reformulate(glbFeatsCategory), scales = "free") +
# stat_smooth(method="glm"))
# if prediction is a TP (true +ve), measure distance from 1.0
tp <- which((df[, predct_var_name] == df[, glb_rsp_var]) &
(df[, predct_var_name] == levels(df[, glb_rsp_var])[2]))
df[tp, predct_erabs_var_name] <- abs(1 - df[tp, predct_prob_var_name])
#rowIx <- which.max(df[tp, predct_erabs_var_name]); df[tp, c(glbFeatsId, glb_rsp_var, predct_var_name, predct_prob_var_name, predct_erabs_var_name)][rowIx, ]
# if prediction is a TN (true -ve), measure distance from 0.0
tn <- which((df[, predct_var_name] == df[, glb_rsp_var]) &
(df[, predct_var_name] == levels(df[, glb_rsp_var])[1]))
df[tn, predct_erabs_var_name] <- abs(0 - df[tn, predct_prob_var_name])
#rowIx <- which.max(df[tn, predct_erabs_var_name]); df[tn, c(glbFeatsId, glb_rsp_var, predct_var_name, predct_prob_var_name, predct_erabs_var_name)][rowIx, ]
# if prediction is a FP (flse +ve), measure distance from 0.0
fp <- which((df[, predct_var_name] != df[, glb_rsp_var]) &
(df[, predct_var_name] == levels(df[, glb_rsp_var])[2]))
df[fp, predct_erabs_var_name] <- abs(0 - df[fp, predct_prob_var_name])
#rowIx <- which.max(df[fp, predct_erabs_var_name]); df[fp, c(glbFeatsId, glb_rsp_var, predct_var_name, predct_prob_var_name, predct_erabs_var_name)][rowIx, ]
# if prediction is a FN (flse -ve), measure distance from 1.0
fn <- which((df[, predct_var_name] != df[, glb_rsp_var]) &
(df[, predct_var_name] == levels(df[, glb_rsp_var])[1]))
df[fn, predct_erabs_var_name] <- abs(1 - df[fn, predct_prob_var_name])
#rowIx <- which.max(df[fn, predct_erabs_var_name]); df[fn, c(glbFeatsId, glb_rsp_var, predct_var_name, predct_prob_var_name, predct_erabs_var_name)][rowIx, ]
if (verbose) print(head(orderBy(reformulate(c("-", predct_erabs_var_name)), df)))
df[, predct_accurate_var_name] <- (df[, glb_rsp_var] == df[, predct_var_name])
}
if (glb_is_classification && !glb_is_binomial) {
df[, predct_var_name] <- predict(mdl, newdata = df, type = "raw")
probCls <- predict(mdl, newdata = df, type = "prob")
df[, predct_prob_var_name] <- NA
for (cls in names(probCls)) {
mask <- (df[, predct_var_name] == cls)
df[mask, predct_prob_var_name] <- probCls[mask, cls]
}
if (verbose) print(myplot_histogram(df, predct_prob_var_name,
fill_col_name = predct_var_name))
if (verbose) print(myplot_histogram(df, predct_prob_var_name,
facet_frmla = paste0("~", glb_rsp_var)))
df[, predct_error_var_name] <- df[, predct_var_name] != df[, glb_rsp_var]
# if prediction is erroneous, measure predicted class prob from actual class prob
df[, predct_erabs_var_name] <- 0
for (cls in names(probCls)) {
mask <- (df[, glb_rsp_var] == cls) & (df[, predct_error_var_name])
df[mask, predct_erabs_var_name] <- probCls[mask, cls]
}
df[, predct_accurate_var_name] <- (df[, glb_rsp_var] == df[, predct_var_name])
}
return(df)
}
if (glb_is_classification && glb_is_binomial &&
(length(unique(glbObsFit[, glb_rsp_var])) < 2))
stop("glbObsFit$", glb_rsp_var, ": contains less than 2 unique values: ",
paste0(unique(glbObsFit[, glb_rsp_var]), collapse=", "))
max_cor_y_x_vars <- orderBy(~ -cor.y.abs,
subset(glb_feats_df, (exclude.as.feat == 0) & !nzv & !is.cor.y.abs.low &
is.na(cor.high.X)))[1:2, "id"]
max_cor_y_x_vars <- max_cor_y_x_vars[!is.na(max_cor_y_x_vars)]
if (length(max_cor_y_x_vars) < 2)
max_cor_y_x_vars <- union(max_cor_y_x_vars, ".pos")
if (!is.null(glb_Baseline_mdl_var)) {
if ((max_cor_y_x_vars[1] != glb_Baseline_mdl_var) &
(glb_feats_df[glb_feats_df$id == max_cor_y_x_vars[1], "cor.y.abs"] >
glb_feats_df[glb_feats_df$id == glb_Baseline_mdl_var, "cor.y.abs"]))
stop(max_cor_y_x_vars[1], " has a higher correlation with ", glb_rsp_var,
" than the Baseline var: ", glb_Baseline_mdl_var)
}
glb_model_type <- ifelse(glb_is_regression, "regression", "classification")
# Model specs
# c("id.prefix", "method", "type",
# # trainControl params
# "preProc.method", "cv.n.folds", "cv.n.repeats", "summary.fn",
# # train params
# "metric", "metric.maximize", "tune.df")
# Baseline
if (!is.null(glb_Baseline_mdl_var)) {
fit.models_0_chunk_df <- myadd_chunk(fit.models_0_chunk_df,
paste0("fit.models_0_", "Baseline"), major.inc = FALSE,
label.minor = "mybaseln_classfr")
ret_lst <- myfit_mdl(mdl_id="Baseline",
model_method="mybaseln_classfr",
indepVar=glb_Baseline_mdl_var,
rsp_var=glb_rsp_var,
fit_df=glbObsFit, OOB_df=glbObsOOB)
}
# Most Frequent Outcome "MFO" model: mean(y) for regression
# Not using caret's nullModel since model stats not avl
# Cannot use rpart for multinomial classification since it predicts non-MFO
if (glb_is_classification) {
fit.models_0_chunk_df <- myadd_chunk(fit.models_0_chunk_df,
paste0("fit.models_0_", "MFO"), major.inc = FALSE,
label.minor = "myMFO_classfr")
ret_lst <- myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst = list(
id.prefix = "MFO", type = glb_model_type, trainControl.method = "none",
train.method = ifelse(glb_is_regression, "lm", "myMFO_classfr"))),
indepVar = ".rnorm", rsp_var = glb_rsp_var,
fit_df = glbObsFit, OOB_df = glbObsOOB)
# "random" model - only for classification;
# none needed for regression since it is same as MFO
fit.models_0_chunk_df <- myadd_chunk(fit.models_0_chunk_df,
paste0("fit.models_0_", "Random"), major.inc = FALSE,
label.minor = "myrandom_classfr")
#stop(here"); glb2Sav(); all.equal(glb_models_df, sav_models_df)
ret_lst <- myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst = list(
id.prefix = "Random", type = glb_model_type, trainControl.method = "none",
train.method = "myrandom_classfr")),
indepVar = ".rnorm", rsp_var = glb_rsp_var,
fit_df = glbObsFit, OOB_df = glbObsOOB)
}
## label step_major step_minor label_minor bgn end
## 1 fit.models_0_bgn 1 0 setup 22.794 22.828
## 2 fit.models_0_MFO 1 1 myMFO_classfr 22.828 NA
## elapsed
## 1 0.034
## 2 NA
## [1] "myfit_mdl: enter: 0.001000 secs"
## [1] "myfit_mdl: fitting model: MFO###myMFO_classfr"
## [1] " indepVar: .rnorm"
## [1] "myfit_mdl: setup complete: 0.378000 secs"
## Fitting parameter = none on full training set
## [1] "in MFO.Classifier$fit"
## [1] "unique.vals:"
## [1] N Y
## Levels: N Y
## [1] "unique.prob:"
## y
## N Y
## 0.7743332 0.2256668
## [1] "MFO.val:"
## [1] "N"
## [1] "myfit_mdl: train complete: 0.703000 secs"
## parameter
## 1 none
## Length Class Mode
## unique.vals 2 factor numeric
## unique.prob 2 table numeric
## MFO.val 1 -none- character
## x.names 1 -none- character
## xNames 1 -none- character
## problemType 1 -none- character
## tuneValue 1 data.frame list
## obsLevels 2 -none- character
## Warning in if (mdl_specs_lst[["train.method"]] %in% c("glm"))
## mydisplayOutliers(mdl, : the condition has length > 1 and only the first
## element will be used
## [1] "myfit_mdl: train diagnostics complete: 0.705000 secs"
## Loading required namespace: pROC
## [1] "entr MFO.Classifier$predict"
## [1] "exit MFO.Classifier$predict"
## Loading required package: ROCR
## Loading required package: gplots
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
## [1] "in MFO.Classifier$prob"
## N Y
## 1 0.7743332 0.2256668
## 2 0.7743332 0.2256668
## 3 0.7743332 0.2256668
## 4 0.7743332 0.2256668
## 5 0.7743332 0.2256668
## 6 0.7743332 0.2256668
## Loading required package: sqldf
## Loading required package: gsubfn
## Loading required package: proto
## Warning in doTryCatch(return(expr), name, parentenv, handler): unable to load shared object '/Library/Frameworks/R.framework/Resources/modules//R_X11.so':
## dlopen(/Library/Frameworks/R.framework/Resources/modules//R_X11.so, 6): Library not loaded: /opt/X11/lib/libSM.6.dylib
## Referenced from: /Library/Frameworks/R.framework/Resources/modules//R_X11.so
## Reason: image not found
## Could not load tcltk. Will use slower R code instead.
## Loading required package: RSQLite
## Loading required package: DBI
## [1] "mypredict_mdl: maxMetricDf:"
## threshold f.score Accuracy g.score
## 6 0.25 0 0.7743332 0
## 7 0.30 0 0.7743332 0
## 8 0.35 0 0.7743332 0
## 9 0.40 0 0.7743332 0
## 10 0.45 0 0.7743332 0
## 11 0.50 0 0.7743332 0
## 12 0.55 0 0.7743332 0
## 13 0.60 0 0.7743332 0
## 14 0.65 0 0.7743332 0
## 15 0.70 0 0.7743332 0
## 16 0.75 0 0.7743332 0
## 17 0.80 0 0.7743332 0
## 18 0.85 0 0.7743332 0
## 19 0.90 0 0.7743332 0
## 20 0.95 0 0.7743332 0
## 21 1.00 0 0.7743332 0
## Prediction
## Reference N Y
## N 26476 0
## Y 7716 0
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 0.7743332 0.0000000 0.7698645 0.7787552 0.7743332
## AccuracyPValue McnemarPValue
## 0.5030525 0.0000000
## [1] "entr MFO.Classifier$predict"
## [1] "exit MFO.Classifier$predict"
## [1] "in MFO.Classifier$prob"
## N Y
## 1 0.7743332 0.2256668
## 2 0.7743332 0.2256668
## 3 0.7743332 0.2256668
## 4 0.7743332 0.2256668
## 5 0.7743332 0.2256668
## 6 0.7743332 0.2256668
## [1] "mypredict_mdl: maxMetricDf:"
## threshold f.score Accuracy g.score
## 6 0.25 0 0.7743332 0
## 7 0.30 0 0.7743332 0
## 8 0.35 0 0.7743332 0
## 9 0.40 0 0.7743332 0
## 10 0.45 0 0.7743332 0
## 11 0.50 0 0.7743332 0
## 12 0.55 0 0.7743332 0
## 13 0.60 0 0.7743332 0
## 14 0.65 0 0.7743332 0
## 15 0.70 0 0.7743332 0
## 16 0.75 0 0.7743332 0
## 17 0.80 0 0.7743332 0
## 18 0.85 0 0.7743332 0
## 19 0.90 0 0.7743332 0
## 20 0.95 0 0.7743332 0
## 21 1.00 0 0.7743332 0
## Prediction
## Reference N Y
## N 6619 0
## Y 1929 0
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 0.7743332 0.0000000 0.7653204 0.7831586 0.7743332
## AccuracyPValue McnemarPValue
## 0.5061047 0.0000000
## [1] "myfit_mdl: predict complete: 19.105000 secs"
## id feats max.nTuningRuns min.elapsedtime.everything
## 1 MFO###myMFO_classfr .rnorm 0 0.296
## min.elapsedtime.final max.AUCpROC.fit max.Sens.fit max.Spec.fit
## 1 0.027 0.5 1 0
## max.AUCROCR.fit opt.prob.threshold.fit max.f.score.fit max.Accuracy.fit
## 1 0.5 0.5 0 0.7743332
## max.AccuracyLower.fit max.AccuracyUpper.fit max.Kappa.fit
## 1 0.7698645 0.7787552 0
## min.log.loss.mean.fit max.AUCpROC.OOB max.Sens.OOB max.Spec.OOB
## 1 0.5339873 0.5 1 0
## max.AUCROCR.OOB opt.prob.threshold.OOB max.f.score.OOB max.Accuracy.OOB
## 1 0.5 0.5 0 0.7743332
## max.AccuracyLower.OOB max.AccuracyUpper.OOB max.Kappa.OOB
## 1 0.7653204 0.7831586 0
## min.log.loss.mean.OOB
## 1 0.5339873
## [1] "in MFO.Classifier$prob"
## N Y
## 1 0.7743332 0.2256668
## 2 0.7743332 0.2256668
## 3 0.7743332 0.2256668
## 4 0.7743332 0.2256668
## 5 0.7743332 0.2256668
## 6 0.7743332 0.2256668
## [1] "myfit_mdl: exit: 19.397000 secs"
## label step_major step_minor label_minor bgn end
## 2 fit.models_0_MFO 1 1 myMFO_classfr 22.828 42.228
## 3 fit.models_0_Random 1 2 myrandom_classfr 42.229 NA
## elapsed
## 2 19.401
## 3 NA
## [1] "myfit_mdl: enter: 0.000000 secs"
## [1] "myfit_mdl: fitting model: Random###myrandom_classfr"
## [1] " indepVar: .rnorm"
## [1] "myfit_mdl: setup complete: 0.365000 secs"
## Fitting parameter = none on full training set
## [1] "myfit_mdl: train complete: 0.707000 secs"
## parameter
## 1 none
## Length Class Mode
## unique.vals 2 factor numeric
## unique.prob 2 table numeric
## xNames 1 -none- character
## problemType 1 -none- character
## tuneValue 1 data.frame list
## obsLevels 2 -none- character
## Warning in if (mdl_specs_lst[["train.method"]] %in% c("glm"))
## mydisplayOutliers(mdl, : the condition has length > 1 and only the first
## element will be used
## [1] "myfit_mdl: train diagnostics complete: 0.710000 secs"
## [1] "in Random.Classifier$prob"
## Prediction
## Reference N Y
## N 26476 0
## Y 7716 0
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 0.7743332 0.0000000 0.7698645 0.7787552 0.7743332
## AccuracyPValue McnemarPValue
## 0.5030525 0.0000000
## [1] "in Random.Classifier$prob"
## Prediction
## Reference N Y
## N 6619 0
## Y 1929 0
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 0.7743332 0.0000000 0.7653204 0.7831586 0.7743332
## AccuracyPValue McnemarPValue
## 0.5061047 0.0000000
## [1] "myfit_mdl: predict complete: 32.779000 secs"
## id feats max.nTuningRuns
## 1 Random###myrandom_classfr .rnorm 0
## min.elapsedtime.everything min.elapsedtime.final max.AUCpROC.fit
## 1 0.301 0.039 0.5007904
## max.Sens.fit max.Spec.fit max.AUCROCR.fit opt.prob.threshold.fit
## 1 0.7720577 0.2295231 0.5040612 0.8
## max.f.score.fit max.Accuracy.fit max.AccuracyLower.fit
## 1 0 0.7743332 0.7698645
## max.AccuracyUpper.fit max.Kappa.fit min.log.loss.mean.fit
## 1 0.7787552 0 0.683165
## max.AUCpROC.OOB max.Sens.OOB max.Spec.OOB max.AUCROCR.OOB
## 1 0.4927907 0.7709624 0.214619 0.5080821
## opt.prob.threshold.OOB max.f.score.OOB max.Accuracy.OOB
## 1 0.8 0 0.7743332
## max.AccuracyLower.OOB max.AccuracyUpper.OOB max.Kappa.OOB
## 1 0.7653204 0.7831586 0
## min.log.loss.mean.OOB
## 1 0.681975
## [1] "in Random.Classifier$prob"
## [1] "myfit_mdl: exit: 51.780000 secs"
# Max.cor.Y
# Check impact of cv
# rpart is not a good candidate since caret does not optimize cp (only tuning parameter of rpart) well
fit.models_0_chunk_df <- myadd_chunk(fit.models_0_chunk_df,
paste0("fit.models_0_", "Max.cor.Y.rcv.*X*"), major.inc = FALSE,
label.minor = "glmnet")
## label step_major step_minor label_minor
## 3 fit.models_0_Random 1 2 myrandom_classfr
## 4 fit.models_0_Max.cor.Y.rcv.*X* 1 3 glmnet
## bgn end elapsed
## 3 42.229 94.018 51.789
## 4 94.018 NA NA
ret_lst <- myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst = list(
id.prefix = "Max.cor.Y.rcv.1X1", type = glb_model_type, trainControl.method = "none",
train.method = "glmnet")),
indepVar = max_cor_y_x_vars, rsp_var = glb_rsp_var,
fit_df = glbObsFit, OOB_df = glbObsOOB)
## [1] "myfit_mdl: enter: 0.001000 secs"
## [1] "myfit_mdl: fitting model: Max.cor.Y.rcv.1X1###glmnet"
## [1] " indepVar: Datereceived.last16.log1p,Datereceived.last8.log1p"
## [1] "myfit_mdl: setup complete: 0.563000 secs"
## Loading required package: glmnet
## Loading required package: Matrix
## Loaded glmnet 2.0-5
## Fitting alpha = 0.1, lambda = 0.000666 on full training set
## [1] "myfit_mdl: train complete: 1.342000 secs"
## alpha lambda
## 1 0.1 0.0006662283
## Length Class Mode
## a0 35 -none- numeric
## beta 70 dgCMatrix S4
## df 35 -none- numeric
## dim 2 -none- numeric
## lambda 35 -none- numeric
## dev.ratio 35 -none- numeric
## nulldev 1 -none- numeric
## npasses 1 -none- numeric
## jerr 1 -none- numeric
## offset 1 -none- logical
## classnames 2 -none- character
## call 5 -none- call
## nobs 1 -none- numeric
## lambdaOpt 1 -none- numeric
## xNames 2 -none- character
## problemType 1 -none- character
## tuneValue 2 data.frame list
## obsLevels 2 -none- character
## [1] "min lambda > lambdaOpt:"
## (Intercept) Datereceived.last16.log1p
## -1.42718726 0.02019617
## Datereceived.last8.log1p
## 0.01846209
## [1] "max lambda < lambdaOpt:"
## [1] "Feats mismatch between coefs_left & rght:"
## [1] "(Intercept)" "Datereceived.last16.log1p"
## [3] "Datereceived.last8.log1p"
## [1] "myfit_mdl: train diagnostics complete: 1.394000 secs"
## [1] "mypredict_mdl: maxMetricDf:"
## threshold f.score Accuracy g.score
## 7 0.30 0 0.7743332 0
## 8 0.35 0 0.7743332 0
## 9 0.40 0 0.7743332 0
## 10 0.45 0 0.7743332 0
## 11 0.50 0 0.7743332 0
## 12 0.55 0 0.7743332 0
## 13 0.60 0 0.7743332 0
## 14 0.65 0 0.7743332 0
## 15 0.70 0 0.7743332 0
## 16 0.75 0 0.7743332 0
## 17 0.80 0 0.7743332 0
## 18 0.85 0 0.7743332 0
## 19 0.90 0 0.7743332 0
## 20 0.95 0 0.7743332 0
## 21 1.00 0 0.7743332 0
## Prediction
## Reference N Y
## N 26476 0
## Y 7716 0
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 0.7743332 0.0000000 0.7698645 0.7787552 0.7743332
## AccuracyPValue McnemarPValue
## 0.5030525 0.0000000
## [1] "mypredict_mdl: maxMetricDf:"
## threshold f.score Accuracy g.score
## 7 0.30 0 0.7743332 0
## 8 0.35 0 0.7743332 0
## 9 0.40 0 0.7743332 0
## 10 0.45 0 0.7743332 0
## 11 0.50 0 0.7743332 0
## 12 0.55 0 0.7743332 0
## 13 0.60 0 0.7743332 0
## 14 0.65 0 0.7743332 0
## 15 0.70 0 0.7743332 0
## 16 0.75 0 0.7743332 0
## 17 0.80 0 0.7743332 0
## 18 0.85 0 0.7743332 0
## 19 0.90 0 0.7743332 0
## 20 0.95 0 0.7743332 0
## 21 1.00 0 0.7743332 0
## Prediction
## Reference N Y
## N 6619 0
## Y 1929 0
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 0.7743332 0.0000000 0.7653204 0.7831586 0.7743332
## AccuracyPValue McnemarPValue
## 0.5061047 0.0000000
## [1] "myfit_mdl: predict complete: 19.388000 secs"
## id
## 1 Max.cor.Y.rcv.1X1###glmnet
## feats max.nTuningRuns
## 1 Datereceived.last16.log1p,Datereceived.last8.log1p 0
## min.elapsedtime.everything min.elapsedtime.final max.AUCpROC.fit
## 1 0.752 0.171 0.5
## max.Sens.fit max.Spec.fit max.AUCROCR.fit opt.prob.threshold.fit
## 1 1 0 0.5540349 0.5
## max.f.score.fit max.Accuracy.fit max.AccuracyLower.fit
## 1 0 0.7743332 0.7698645
## max.AccuracyUpper.fit max.Kappa.fit min.log.loss.mean.fit
## 1 0.7787552 0 0.5301989
## max.AUCpROC.OOB max.Sens.OOB max.Spec.OOB max.AUCROCR.OOB
## 1 0.5 1 0 0.5520402
## opt.prob.threshold.OOB max.f.score.OOB max.Accuracy.OOB
## 1 0.5 0 0.7743332
## max.AccuracyLower.OOB max.AccuracyUpper.OOB max.Kappa.OOB
## 1 0.7653204 0.7831586 0
## min.log.loss.mean.OOB
## 1 0.5302662
## [1] "myfit_mdl: exit: 19.689000 secs"
if (glbMdlCheckRcv) {
# rcv_n_folds == 1 & rcv_n_repeats > 1 crashes
for (rcv_n_folds in seq(3, glb_rcv_n_folds + 2, 2))
for (rcv_n_repeats in seq(1, glb_rcv_n_repeats + 2, 2)) {
# Experiment specific code to avoid caret crash
# lcl_tune_models_df <- rbind(data.frame()
# ,data.frame(method = "glmnet", parameter = "alpha",
# vals = "0.100 0.325 0.550 0.775 1.000")
# ,data.frame(method = "glmnet", parameter = "lambda",
# vals = "9.342e-02")
# )
ret_lst <- myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst =
list(
id.prefix = paste0("Max.cor.Y.rcv.", rcv_n_folds, "X", rcv_n_repeats),
type = glb_model_type,
# tune.df = lcl_tune_models_df,
trainControl.method = "repeatedcv",
trainControl.number = rcv_n_folds,
trainControl.repeats = rcv_n_repeats,
trainControl.classProbs = glb_is_classification,
trainControl.summaryFunction = glbMdlMetricSummaryFn,
train.method = "glmnet", train.metric = glbMdlMetricSummary,
train.maximize = glbMdlMetricMaximize)),
indepVar = max_cor_y_x_vars, rsp_var = glb_rsp_var,
fit_df = glbObsFit, OOB_df = glbObsOOB)
}
# Add parallel coordinates graph of glb_models_df[, glbMdlMetricsEval] to evaluate cv parameters
tmp_models_cols <- c("id", "max.nTuningRuns",
glbMdlMetricsEval[glbMdlMetricsEval %in% names(glb_models_df)],
grep("opt.", names(glb_models_df), fixed = TRUE, value = TRUE))
print(myplot_parcoord(obs_df = subset(glb_models_df,
grepl("Max.cor.Y.rcv.", id, fixed = TRUE),
select = -feats)[, tmp_models_cols],
id_var = "id"))
}
# Useful for stacking decisions
# fit.models_0_chunk_df <- myadd_chunk(fit.models_0_chunk_df,
# paste0("fit.models_0_", "Max.cor.Y[rcv.1X1.cp.0|]"), major.inc = FALSE,
# label.minor = "rpart")
#
# ret_lst <- myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst = list(
# id.prefix = "Max.cor.Y.rcv.1X1.cp.0", type = glb_model_type, trainControl.method = "none",
# train.method = "rpart",
# tune.df=data.frame(method="rpart", parameter="cp", min=0.0, max=0.0, by=0.1))),
# indepVar=max_cor_y_x_vars, rsp_var=glb_rsp_var,
# fit_df=glbObsFit, OOB_df=glbObsOOB)
#stop(here"); glb2Sav(); all.equal(glb_models_df, sav_models_df)
# if (glb_is_regression || glb_is_binomial) # For multinomials this model will be run next by default
ret_lst <- myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst = list(
id.prefix = "Max.cor.Y",
type = glb_model_type, trainControl.method = "repeatedcv",
trainControl.number = glb_rcv_n_folds,
trainControl.repeats = glb_rcv_n_repeats,
trainControl.classProbs = glb_is_classification,
trainControl.summaryFunction = glbMdlMetricSummaryFn,
trainControl.blockParallel = glbMdlSequential,
train.metric = glbMdlMetricSummary,
train.maximize = glbMdlMetricMaximize,
train.method = "rpart")),
indepVar = max_cor_y_x_vars, rsp_var = glb_rsp_var,
fit_df = glbObsFit, OOB_df = glbObsOOB)
## [1] "myfit_mdl: enter: 0.000000 secs"
## [1] "myfit_mdl: fitting model: Max.cor.Y##rcv#rpart"
## [1] " indepVar: Datereceived.last16.log1p,Datereceived.last8.log1p"
## [1] "myfit_mdl: setup complete: 0.573000 secs"
## Loading required package: rpart
## Aggregating results
## Fitting final model on full training set
## [1] "myfit_mdl: train complete: 2.978000 secs"
## cp
## 1 0
## Loading required package: rpart.plot
## Call:
## rpart(formula = .outcome ~ ., control = list(minsplit = 20, minbucket = 7,
## cp = 0, maxcompete = 4, maxsurrogate = 5, usesurrogate = 2,
## surrogatestyle = 0, maxdepth = 30, xval = 0))
## n= 34192
##
## CP nsplit rel error
## 1 0 0 1
##
## Node number 1: 34192 observations
## predicted class=N expected loss=0.2256668 P(node) =1
## class counts: 26476 7716
## probabilities: 0.774 0.226
##
## n= 34192
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 34192 7716 N (0.7743332 0.2256668) *
## [1] "myfit_mdl: train diagnostics complete: 3.061000 secs"
## [1] "mypredict_mdl: maxMetricDf:"
## threshold f.score Accuracy g.score
## 6 0.25 0 0.7743332 0
## 7 0.30 0 0.7743332 0
## 8 0.35 0 0.7743332 0
## 9 0.40 0 0.7743332 0
## 10 0.45 0 0.7743332 0
## 11 0.50 0 0.7743332 0
## 12 0.55 0 0.7743332 0
## 13 0.60 0 0.7743332 0
## 14 0.65 0 0.7743332 0
## 15 0.70 0 0.7743332 0
## 16 0.75 0 0.7743332 0
## 17 0.80 0 0.7743332 0
## 18 0.85 0 0.7743332 0
## 19 0.90 0 0.7743332 0
## 20 0.95 0 0.7743332 0
## 21 1.00 0 0.7743332 0
## Prediction
## Reference N Y
## N 26476 0
## Y 7716 0
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 0.7743332 0.0000000 0.7698645 0.7787552 0.7743332
## AccuracyPValue McnemarPValue
## 0.5030525 0.0000000
## [1] "mypredict_mdl: maxMetricDf:"
## threshold f.score Accuracy g.score
## 6 0.25 0 0.7743332 0
## 7 0.30 0 0.7743332 0
## 8 0.35 0 0.7743332 0
## 9 0.40 0 0.7743332 0
## 10 0.45 0 0.7743332 0
## 11 0.50 0 0.7743332 0
## 12 0.55 0 0.7743332 0
## 13 0.60 0 0.7743332 0
## 14 0.65 0 0.7743332 0
## 15 0.70 0 0.7743332 0
## 16 0.75 0 0.7743332 0
## 17 0.80 0 0.7743332 0
## 18 0.85 0 0.7743332 0
## 19 0.90 0 0.7743332 0
## 20 0.95 0 0.7743332 0
## 21 1.00 0 0.7743332 0
## Prediction
## Reference N Y
## N 6619 0
## Y 1929 0
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 0.7743332 0.0000000 0.7653204 0.7831586 0.7743332
## AccuracyPValue McnemarPValue
## 0.5061047 0.0000000
## [1] "myfit_mdl: predict complete: 20.569000 secs"
## id feats
## 1 Max.cor.Y##rcv#rpart Datereceived.last16.log1p,Datereceived.last8.log1p
## max.nTuningRuns min.elapsedtime.everything min.elapsedtime.final
## 1 1 2.376 0.051
## max.AUCpROC.fit max.Sens.fit max.Spec.fit max.AUCROCR.fit
## 1 0.5 1 0 0.5
## opt.prob.threshold.fit max.f.score.fit max.Accuracy.fit
## 1 0.5 0 0.7743332
## max.AccuracyLower.fit max.AccuracyUpper.fit max.Kappa.fit
## 1 0.7698645 0.7787552 0
## min.log.loss.mean.fit max.AUCpROC.OOB max.Sens.OOB max.Spec.OOB
## 1 0.5339873 0.5 1 0
## max.AUCROCR.OOB opt.prob.threshold.OOB max.f.score.OOB max.Accuracy.OOB
## 1 0.5 0.5 0 0.7743332
## max.AccuracyLower.OOB max.AccuracyUpper.OOB max.Kappa.OOB
## 1 0.7653204 0.7831586 0
## min.log.loss.mean.OOB max.AccuracySD.fit max.KappaSD.fit
## 1 0.5339873 6.863966e-05 0
## [1] "myfit_mdl: exit: 20.879000 secs"
if ((length(glbFeatsDateTime) > 0) &&
(sum(grepl(paste(names(glbFeatsDateTime), "\\.day\\.minutes\\.poly\\.", sep = ""),
names(glbObsAll))) > 0)) {
fit.models_0_chunk_df <- myadd_chunk(fit.models_0_chunk_df,
paste0("fit.models_0_", "Max.cor.Y.Time.Poly"), major.inc = FALSE,
label.minor = "glmnet")
indepVars <- c(max_cor_y_x_vars,
grep(paste(names(glbFeatsDateTime), "\\.day\\.minutes\\.poly\\.", sep = ""),
names(glbObsAll), value = TRUE))
indepVars <- myadjustInteractionFeats(glb_feats_df, indepVars)
ret_lst <- myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst = list(
id.prefix = "Max.cor.Y.Time.Poly",
type = glb_model_type, trainControl.method = "repeatedcv",
trainControl.number = glb_rcv_n_folds, trainControl.repeats = glb_rcv_n_repeats,
trainControl.classProbs = glb_is_classification,
trainControl.summaryFunction = glbMdlMetricSummaryFn,
trainControl.blockParallel = glbMdlSequential,
train.metric = glbMdlMetricSummary,
train.maximize = glbMdlMetricMaximize,
train.method = "glmnet")),
indepVar = indepVars,
rsp_var = glb_rsp_var,
fit_df = glbObsFit, OOB_df = glbObsOOB)
}
## Warning in grepl(paste(names(glbFeatsDateTime), "\\.day\\.minutes\\.poly\
## \.", : argument 'pattern' has length > 1 and only the first element will be
## used
if ((length(glbFeatsDateTime) > 0) &&
(sum(grepl(paste(names(glbFeatsDateTime), "\\.last[[:digit:]]", sep = ""),
names(glbObsAll))) > 0)) {
fit.models_0_chunk_df <- myadd_chunk(fit.models_0_chunk_df,
paste0("fit.models_0_", "Max.cor.Y.Time.Lag"), major.inc = FALSE,
label.minor = "glmnet")
indepVars <- c(max_cor_y_x_vars,
grep(paste(names(glbFeatsDateTime), "\\.last[[:digit:]]", sep = ""),
names(glbObsAll), value = TRUE))
indepVars <- myadjustInteractionFeats(glb_feats_df, indepVars)
ret_lst <- myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst = list(
id.prefix = "Max.cor.Y.Time.Lag",
type = glb_model_type,
tune.df = glbMdlTuneParams,
trainControl.method = "repeatedcv",
trainControl.number = glb_rcv_n_folds, trainControl.repeats = glb_rcv_n_repeats,
trainControl.classProbs = glb_is_classification,
trainControl.summaryFunction = glbMdlMetricSummaryFn,
trainControl.blockParallel = glbMdlSequential,
train.metric = glbMdlMetricSummary,
train.maximize = glbMdlMetricMaximize,
train.method = "glmnet")),
indepVar = indepVars,
rsp_var = glb_rsp_var,
fit_df = glbObsFit, OOB_df = glbObsOOB)
}
## Warning in grepl(paste(names(glbFeatsDateTime), "\\.last[[:digit:]]", sep =
## ""), : argument 'pattern' has length > 1 and only the first element will be
## used
## label step_major step_minor label_minor
## 4 fit.models_0_Max.cor.Y.rcv.*X* 1 3 glmnet
## 5 fit.models_0_Max.cor.Y.Time.Lag 1 4 glmnet
## bgn end elapsed
## 4 94.018 134.612 40.595
## 5 134.613 NA NA
## Warning in grep(paste(names(glbFeatsDateTime), "\\.last[[:digit:]]", sep =
## ""), : argument 'pattern' has length > 1 and only the first element will be
## used
## [1] "myfit_mdl: enter: 0.000000 secs"
## [1] "myfit_mdl: fitting model: Max.cor.Y.Time.Lag##rcv#glmnet"
## [1] " indepVar: Datereceived.last16.log1p,Datereceived.last8.log1p,Datereceived.last2.log1p,Datereceived.last4.log1p,Datereceived.last8.log1p,Datereceived.last16.log1p,Datereceived.last32.log1p"
## [1] "myfit_mdl: setup complete: 0.572000 secs"
## Aggregating results
## Selecting tuning parameters
## Fitting alpha = 0.1, lambda = 0.0144 on full training set
## [1] "myfit_mdl: train complete: 10.143000 secs"
## Warning in myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst
## = list(id.prefix = "Max.cor.Y.Time.Lag", : model's bestTune found at an
## extreme of tuneGrid for parameter: alpha
## Warning in myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst
## = list(id.prefix = "Max.cor.Y.Time.Lag", : model's bestTune found at an
## extreme of tuneGrid for parameter: lambda
## Length Class Mode
## a0 35 -none- numeric
## beta 175 dgCMatrix S4
## df 35 -none- numeric
## dim 2 -none- numeric
## lambda 35 -none- numeric
## dev.ratio 35 -none- numeric
## nulldev 1 -none- numeric
## npasses 1 -none- numeric
## jerr 1 -none- numeric
## offset 1 -none- logical
## classnames 2 -none- character
## call 5 -none- call
## nobs 1 -none- numeric
## lambdaOpt 1 -none- numeric
## xNames 5 -none- character
## problemType 1 -none- character
## tuneValue 2 data.frame list
## obsLevels 2 -none- character
## [1] "min lambda > lambdaOpt:"
## (Intercept) Datereceived.last16.log1p
## -1.495163848 0.016472879
## Datereceived.last2.log1p Datereceived.last32.log1p
## 0.003614596 0.009519897
## Datereceived.last4.log1p Datereceived.last8.log1p
## 0.003352991 0.015522667
## [1] "max lambda < lambdaOpt:"
## (Intercept) Datereceived.last16.log1p
## -1.497663494 0.016606787
## Datereceived.last2.log1p Datereceived.last32.log1p
## 0.003730380 0.009637512
## Datereceived.last4.log1p Datereceived.last8.log1p
## 0.003298760 0.015639091
## [1] "myfit_mdl: train diagnostics complete: 10.474000 secs"
## [1] "mypredict_mdl: maxMetricDf:"
## threshold f.score Accuracy g.score
## 7 0.30 0 0.7743332 0
## 8 0.35 0 0.7743332 0
## 9 0.40 0 0.7743332 0
## 10 0.45 0 0.7743332 0
## 11 0.50 0 0.7743332 0
## 12 0.55 0 0.7743332 0
## 13 0.60 0 0.7743332 0
## 14 0.65 0 0.7743332 0
## 15 0.70 0 0.7743332 0
## 16 0.75 0 0.7743332 0
## 17 0.80 0 0.7743332 0
## 18 0.85 0 0.7743332 0
## 19 0.90 0 0.7743332 0
## 20 0.95 0 0.7743332 0
## 21 1.00 0 0.7743332 0
## Prediction
## Reference N Y
## N 26476 0
## Y 7716 0
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 0.7743332 0.0000000 0.7698645 0.7787552 0.7743332
## AccuracyPValue McnemarPValue
## 0.5030525 0.0000000
## [1] "mypredict_mdl: maxMetricDf:"
## threshold f.score Accuracy g.score
## 7 0.30 0 0.7743332 0
## 8 0.35 0 0.7743332 0
## 9 0.40 0 0.7743332 0
## 10 0.45 0 0.7743332 0
## 11 0.50 0 0.7743332 0
## 12 0.55 0 0.7743332 0
## 13 0.60 0 0.7743332 0
## 14 0.65 0 0.7743332 0
## 15 0.70 0 0.7743332 0
## 16 0.75 0 0.7743332 0
## 17 0.80 0 0.7743332 0
## 18 0.85 0 0.7743332 0
## 19 0.90 0 0.7743332 0
## 20 0.95 0 0.7743332 0
## 21 1.00 0 0.7743332 0
## Prediction
## Reference N Y
## N 6619 0
## Y 1929 0
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 0.7743332 0.0000000 0.7653204 0.7831586 0.7743332
## AccuracyPValue McnemarPValue
## 0.5061047 0.0000000
## [1] "myfit_mdl: predict complete: 29.134000 secs"
## id
## 1 Max.cor.Y.Time.Lag##rcv#glmnet
## feats
## 1 Datereceived.last16.log1p,Datereceived.last8.log1p,Datereceived.last2.log1p,Datereceived.last4.log1p,Datereceived.last8.log1p,Datereceived.last16.log1p,Datereceived.last32.log1p
## max.nTuningRuns min.elapsedtime.everything min.elapsedtime.final
## 1 15 9.539 0.217
## max.AUCpROC.fit max.Sens.fit max.Spec.fit max.AUCROCR.fit
## 1 0.5 1 0 0.5585689
## opt.prob.threshold.fit max.f.score.fit max.Accuracy.fit
## 1 0.5 0 0.7743332
## max.AccuracyLower.fit max.AccuracyUpper.fit max.Kappa.fit
## 1 0.7698645 0.7787552 0
## min.log.loss.mean.fit max.AUCpROC.OOB max.Sens.OOB max.Spec.OOB
## 1 0.5300274 0.5 1 0
## max.AUCROCR.OOB opt.prob.threshold.OOB max.f.score.OOB max.Accuracy.OOB
## 1 0.5583842 0.5 0 0.7743332
## max.AccuracyLower.OOB max.AccuracyUpper.OOB max.Kappa.OOB
## 1 0.7653204 0.7831586 0
## min.log.loss.mean.OOB max.AccuracySD.fit max.KappaSD.fit
## 1 0.530128 6.9999e-05 0
## [1] "myfit_mdl: exit: 29.447000 secs"
if (length(glbFeatsText) > 0) {
fit.models_0_chunk_df <- myadd_chunk(fit.models_0_chunk_df,
paste0("fit.models_0_", "Txt.*"), major.inc = FALSE,
label.minor = "glmnet")
indepVars <- c(max_cor_y_x_vars)
for (txtFeat in names(glbFeatsText))
indepVars <- union(indepVars,
grep(paste(str_to_upper(substr(txtFeat, 1, 1)), "\\.(?!([T|P]\\.))", sep = ""),
names(glbObsAll), perl = TRUE, value = TRUE))
indepVars <- myadjustInteractionFeats(glb_feats_df, indepVars)
ret_lst <- myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst = list(
id.prefix = "Max.cor.Y.Text.nonTP",
type = glb_model_type,
tune.df = glbMdlTuneParams,
trainControl.method = "repeatedcv",
trainControl.number = glb_rcv_n_folds, trainControl.repeats = glb_rcv_n_repeats,
trainControl.classProbs = glb_is_classification,
trainControl.summaryFunction = glbMdlMetricSummaryFn,
trainControl.blockParallel = glbMdlSequential,
train.metric = glbMdlMetricSummary,
train.maximize = glbMdlMetricMaximize,
train.method = "glmnet")),
indepVar = indepVars,
rsp_var = glb_rsp_var,
fit_df = glbObsFit, OOB_df = glbObsOOB)
indepVars <- c(max_cor_y_x_vars)
for (txtFeat in names(glbFeatsText))
indepVars <- union(indepVars,
grep(paste(str_to_upper(substr(txtFeat, 1, 1)), "\\.T\\.", sep = ""),
names(glbObsAll), perl = TRUE, value = TRUE))
indepVars <- myadjustInteractionFeats(glb_feats_df, indepVars)
ret_lst <- myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst = list(
id.prefix = "Max.cor.Y.Text.onlyT",
type = glb_model_type,
tune.df = glbMdlTuneParams,
trainControl.method = "repeatedcv",
trainControl.number = glb_rcv_n_folds, trainControl.repeats = glb_rcv_n_repeats,
trainControl.classProbs = glb_is_classification,
trainControl.summaryFunction = glbMdlMetricSummaryFn,
train.metric = glbMdlMetricSummary,
train.maximize = glbMdlMetricMaximize,
train.method = "glmnet")),
indepVar = indepVars,
rsp_var = glb_rsp_var,
fit_df = glbObsFit, OOB_df = glbObsOOB)
indepVars <- c(max_cor_y_x_vars)
for (txtFeat in names(glbFeatsText))
indepVars <- union(indepVars,
grep(paste(str_to_upper(substr(txtFeat, 1, 1)), "\\.P\\.", sep = ""),
names(glbObsAll), perl = TRUE, value = TRUE))
indepVars <- myadjustInteractionFeats(glb_feats_df, indepVars)
ret_lst <- myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst = list(
id.prefix = "Max.cor.Y.Text.onlyP",
type = glb_model_type,
tune.df = glbMdlTuneParams,
trainControl.method = "repeatedcv",
trainControl.number = glb_rcv_n_folds, trainControl.repeats = glb_rcv_n_repeats,
trainControl.classProbs = glb_is_classification,
trainControl.summaryFunction = glbMdlMetricSummaryFn,
trainControl.blockParallel = glbMdlSequential,
train.metric = glbMdlMetricSummary,
train.maximize = glbMdlMetricMaximize,
train.method = "glmnet")),
indepVar = indepVars,
rsp_var = glb_rsp_var,
fit_df = glbObsFit, OOB_df = glbObsOOB)
}
# Interactions.High.cor.Y
if (length(int_feats <- setdiff(setdiff(unique(glb_feats_df$cor.high.X), NA),
subset(glb_feats_df, nzv)$id)) > 0) {
fit.models_0_chunk_df <- myadd_chunk(fit.models_0_chunk_df,
paste0("fit.models_0_", "Interact.High.cor.Y"), major.inc = FALSE,
label.minor = "glmnet")
ret_lst <- myfit_mdl(mdl_specs_lst=myinit_mdl_specs_lst(mdl_specs_lst=list(
id.prefix="Interact.High.cor.Y",
type=glb_model_type, trainControl.method="repeatedcv",
trainControl.number=glb_rcv_n_folds, trainControl.repeats=glb_rcv_n_repeats,
trainControl.classProbs = glb_is_classification,
trainControl.summaryFunction = glbMdlMetricSummaryFn,
trainControl.blockParallel = glbMdlSequential,
train.metric = glbMdlMetricSummary,
train.maximize = glbMdlMetricMaximize,
train.method="glmnet")),
indepVar=c(max_cor_y_x_vars, paste(max_cor_y_x_vars[1], int_feats, sep=":")),
rsp_var=glb_rsp_var,
fit_df=glbObsFit, OOB_df=glbObsOOB)
}
## label step_major step_minor label_minor
## 5 fit.models_0_Max.cor.Y.Time.Lag 1 4 glmnet
## 6 fit.models_0_Interact.High.cor.Y 1 5 glmnet
## bgn end elapsed
## 5 134.613 164.074 29.462
## 6 164.075 NA NA
## [1] "myfit_mdl: enter: 0.001000 secs"
## [1] "myfit_mdl: fitting model: Interact.High.cor.Y##rcv#glmnet"
## [1] " indepVar: Datereceived.last16.log1p,Datereceived.last8.log1p,Datereceived.last16.log1p:Datereceived.last16.log1p,Datereceived.last16.log1p:Datereceived.last8.log1p,Datereceived.last16.log1p:Datereceived.last4.log1p,Datereceived.last16.log1p:Datereceived.last32.log1p,Datereceived.last16.log1p:Datereceived.last2.log1p,Datereceived.last16.log1p:ComplaintID,Datereceived.last16.log1p:Datereceived.year.fctr,Datereceived.last16.log1p:Datereceived.juliandate,Datereceived.last16.log1p:Datereceived.month.fctr,Datereceived.last16.log1p:Datereceived.wkend,Datereceived.last16.log1p:Sent.Recd.Dys.log1p,Datereceived.last16.log1p:Datereceived.date.fctr,Datereceived.last16.log1p:Datereceived.wkday.fctr"
## [1] "myfit_mdl: setup complete: 0.565000 secs"
## Aggregating results
## Selecting tuning parameters
## Fitting alpha = 0.55, lambda = 0.00029 on full training set
## [1] "myfit_mdl: train complete: 62.313000 secs"
## Length Class Mode
## a0 63 -none- numeric
## beta 2205 dgCMatrix S4
## df 63 -none- numeric
## dim 2 -none- numeric
## lambda 63 -none- numeric
## dev.ratio 63 -none- numeric
## nulldev 1 -none- numeric
## npasses 1 -none- numeric
## jerr 1 -none- numeric
## offset 1 -none- logical
## classnames 2 -none- character
## call 5 -none- call
## nobs 1 -none- numeric
## lambdaOpt 1 -none- numeric
## xNames 35 -none- character
## problemType 1 -none- character
## tuneValue 2 data.frame list
## obsLevels 2 -none- character
## [1] "min lambda > lambdaOpt:"
## (Intercept)
## -1.450278e+00
## Datereceived.last16.log1p
## -1.245579e-03
## Datereceived.last8.log1p
## 1.807183e-02
## Datereceived.last16.log1p:ComplaintID
## 2.092372e-08
## Datereceived.last16.log1p:Datereceived.date.fctr(7,13]
## 4.847783e-03
## Datereceived.last16.log1p:Datereceived.date.fctr(13,19]
## 3.796691e-03
## Datereceived.last16.log1p:Datereceived.date.fctr(19,25]
## 6.234218e-03
## Datereceived.last16.log1p:Datereceived.date.fctr(25,31]
## 1.008918e-02
## Datereceived.last16.log1p:Datereceived.juliandate
## 9.000222e-05
## Datereceived.last16.log1p:Datereceived.last2.log1p
## 4.884210e-04
## Datereceived.last16.log1p:Datereceived.last32.log1p
## -2.710306e-03
## Datereceived.last16.log1p:Datereceived.last4.log1p
## 3.677795e-04
## Datereceived.last16.log1p:Datereceived.month.fctr02
## -1.065609e-02
## Datereceived.last16.log1p:Datereceived.month.fctr03
## 2.298617e-03
## Datereceived.last16.log1p:Datereceived.month.fctr05
## 1.731073e-02
## Datereceived.last16.log1p:Datereceived.month.fctr06
## 2.095172e-02
## Datereceived.last16.log1p:Datereceived.month.fctr07
## 2.876470e-02
## Datereceived.last16.log1p:Datereceived.month.fctr08
## 4.378032e-02
## Datereceived.last16.log1p:Datereceived.month.fctr09
## 3.354271e-02
## Datereceived.last16.log1p:Datereceived.month.fctr10
## 5.971978e-03
## Datereceived.last16.log1p:Datereceived.month.fctr11
## -3.011589e-03
## Datereceived.last16.log1p:Datereceived.month.fctr12
## -1.672570e-02
## Datereceived.last16.log1p:Datereceived.wkday.fctr1
## 1.898352e-03
## Datereceived.last16.log1p:Datereceived.wkday.fctr2
## 1.320978e-02
## Datereceived.last16.log1p:Datereceived.wkday.fctr4
## -3.899902e-03
## Datereceived.last16.log1p:Datereceived.wkday.fctr5
## -2.691646e-03
## Datereceived.last16.log1p:Datereceived.wkday.fctr6
## 9.015971e-04
## Datereceived.last16.log1p:Datereceived.wkend
## -1.388963e-02
## Datereceived.last16.log1p:Datereceived.year.fctr2013
## -3.138872e-04
## Datereceived.last16.log1p:Datereceived.year.fctr2014
## -1.786994e-02
## Datereceived.last16.log1p:Datereceived.year.fctr2015
## -1.938140e-02
## Datereceived.last16.log1p:Datereceived.year.fctr2016
## 6.192826e-02
## Datereceived.last16.log1p:Sent.Recd.Dys.log1p
## -2.009394e-03
## [1] "max lambda < lambdaOpt:"
## [1] "Feats mismatch between coefs_left & rght:"
## [1] "(Intercept)"
## [2] "Datereceived.last16.log1p"
## [3] "Datereceived.last8.log1p"
## [4] "Datereceived.last16.log1p:ComplaintID"
## [5] "Datereceived.last16.log1p:Datereceived.date.fctr(7,13]"
## [6] "Datereceived.last16.log1p:Datereceived.date.fctr(13,19]"
## [7] "Datereceived.last16.log1p:Datereceived.date.fctr(19,25]"
## [8] "Datereceived.last16.log1p:Datereceived.date.fctr(25,31]"
## [9] "Datereceived.last16.log1p:Datereceived.juliandate"
## [10] "Datereceived.last16.log1p:Datereceived.last2.log1p"
## [11] "Datereceived.last16.log1p:Datereceived.last32.log1p"
## [12] "Datereceived.last16.log1p:Datereceived.last4.log1p"
## [13] "Datereceived.last16.log1p:Datereceived.last8.log1p"
## [14] "Datereceived.last16.log1p:Datereceived.month.fctr02"
## [15] "Datereceived.last16.log1p:Datereceived.month.fctr03"
## [16] "Datereceived.last16.log1p:Datereceived.month.fctr04"
## [17] "Datereceived.last16.log1p:Datereceived.month.fctr05"
## [18] "Datereceived.last16.log1p:Datereceived.month.fctr06"
## [19] "Datereceived.last16.log1p:Datereceived.month.fctr07"
## [20] "Datereceived.last16.log1p:Datereceived.month.fctr08"
## [21] "Datereceived.last16.log1p:Datereceived.month.fctr09"
## [22] "Datereceived.last16.log1p:Datereceived.month.fctr10"
## [23] "Datereceived.last16.log1p:Datereceived.month.fctr11"
## [24] "Datereceived.last16.log1p:Datereceived.month.fctr12"
## [25] "Datereceived.last16.log1p:Datereceived.wkday.fctr1"
## [26] "Datereceived.last16.log1p:Datereceived.wkday.fctr2"
## [27] "Datereceived.last16.log1p:Datereceived.wkday.fctr3"
## [28] "Datereceived.last16.log1p:Datereceived.wkday.fctr4"
## [29] "Datereceived.last16.log1p:Datereceived.wkday.fctr5"
## [30] "Datereceived.last16.log1p:Datereceived.wkday.fctr6"
## [31] "Datereceived.last16.log1p:Datereceived.wkend"
## [32] "Datereceived.last16.log1p:Datereceived.year.fctr2013"
## [33] "Datereceived.last16.log1p:Datereceived.year.fctr2014"
## [34] "Datereceived.last16.log1p:Datereceived.year.fctr2015"
## [35] "Datereceived.last16.log1p:Datereceived.year.fctr2016"
## [36] "Datereceived.last16.log1p:Sent.Recd.Dys.log1p"
## [1] "myfit_mdl: train diagnostics complete: 62.666000 secs"
## Prediction
## Reference N Y
## N 26402 74
## Y 7020 696
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 7.925246e-01 1.283377e-01 7.881871e-01 7.968123e-01 7.743332e-01
## AccuracyPValue McnemarPValue
## 2.351057e-16 0.000000e+00
## Prediction
## Reference N Y
## N 6597 22
## Y 1750 179
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 7.927000e-01 1.310667e-01 7.839490e-01 8.012512e-01 7.743332e-01
## AccuracyPValue McnemarPValue
## 2.181593e-05 0.000000e+00
## [1] "myfit_mdl: predict complete: 90.174000 secs"
## id
## 1 Interact.High.cor.Y##rcv#glmnet
## feats
## 1 Datereceived.last16.log1p,Datereceived.last8.log1p,Datereceived.last16.log1p:Datereceived.last16.log1p,Datereceived.last16.log1p:Datereceived.last8.log1p,Datereceived.last16.log1p:Datereceived.last4.log1p,Datereceived.last16.log1p:Datereceived.last32.log1p,Datereceived.last16.log1p:Datereceived.last2.log1p,Datereceived.last16.log1p:ComplaintID,Datereceived.last16.log1p:Datereceived.year.fctr,Datereceived.last16.log1p:Datereceived.juliandate,Datereceived.last16.log1p:Datereceived.month.fctr,Datereceived.last16.log1p:Datereceived.wkend,Datereceived.last16.log1p:Sent.Recd.Dys.log1p,Datereceived.last16.log1p:Datereceived.date.fctr,Datereceived.last16.log1p:Datereceived.wkday.fctr
## max.nTuningRuns min.elapsedtime.everything min.elapsedtime.final
## 1 25 61.686 1.735
## max.AUCpROC.fit max.Sens.fit max.Spec.fit max.AUCROCR.fit
## 1 0.5437036 0.997205 0.09020218 0.57376
## opt.prob.threshold.fit max.f.score.fit max.Accuracy.fit
## 1 0.5 0.1640349 0.7919981
## max.AccuracyLower.fit max.AccuracyUpper.fit max.Kappa.fit
## 1 0.7881871 0.7968123 0.1257717
## min.log.loss.mean.fit max.AUCpROC.OOB max.Sens.OOB max.Spec.OOB
## 1 0.5171275 0.5447352 0.9966762 0.09279419
## max.AUCROCR.OOB opt.prob.threshold.OOB max.f.score.OOB max.Accuracy.OOB
## 1 0.577183 0.5 0.1680751 0.7927
## max.AccuracyLower.OOB max.AccuracyUpper.OOB max.Kappa.OOB
## 1 0.783949 0.8012512 0.1310667
## min.log.loss.mean.OOB max.AccuracySD.fit max.KappaSD.fit
## 1 0.5171602 0.001751503 0.01050938
## [1] "myfit_mdl: exit: 90.577000 secs"
# Low.cor.X
fit.models_0_chunk_df <- myadd_chunk(fit.models_0_chunk_df,
paste0("fit.models_0_", "Low.cor.X"), major.inc = FALSE,
label.minor = "glmnet")
## label step_major step_minor label_minor
## 6 fit.models_0_Interact.High.cor.Y 1 5 glmnet
## 7 fit.models_0_Low.cor.X 1 6 glmnet
## bgn end elapsed
## 6 164.075 254.719 90.644
## 7 254.719 NA NA
indepVar <- mygetIndepVar(glb_feats_df)
indepVar <- setdiff(indepVar, unique(glb_feats_df$cor.high.X))
ret_lst <- myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst = list(
id.prefix = "Low.cor.X",
type = glb_model_type,
tune.df = glbMdlTuneParams,
trainControl.method = "repeatedcv",
trainControl.number = glb_rcv_n_folds, trainControl.repeats = glb_rcv_n_repeats,
trainControl.classProbs = glb_is_classification,
trainControl.summaryFunction = glbMdlMetricSummaryFn,
trainControl.blockParallel = glbMdlSequential,
train.metric = glbMdlMetricSummary,
train.maximize = glbMdlMetricMaximize,
train.method = "glmnet")),
indepVar = indepVar, rsp_var = glb_rsp_var,
fit_df = glbObsFit, OOB_df = glbObsOOB)
## [1] "myfit_mdl: enter: 0.000000 secs"
## [1] "myfit_mdl: fitting model: Low.cor.X##rcv#glmnet"
## [1] " indepVar: Datesenttocompany.last16.log1p,Datesenttocompany.last8.log1p,Datesenttocompany.last4.log1p,Datesenttocompany.last32.log1p,Datesenttocompany.last2.log1p,spcPrd.fctr,gCConsent.fctr,Datesenttocompany.year.fctr,Datesenttocompany.juliandate,Datesenttocompany.month.fctr,.pos,Datesenttocompany.wkend,Sent.Recd.Dys.nexp,Datesenttocompany.date.fctr,.rnorm,Rgn.Dvn.fctr,gTags.fctr,Datesenttocompany.wkday.fctr,Sent.Recd.Dys,Sent.Recd.Dys.root2,Channel.fctr,Response.fctr"
## [1] "myfit_mdl: setup complete: 0.563000 secs"
## Aggregating results
## Selecting tuning parameters
## Fitting alpha = 1, lambda = 0.000144 on full training set
## [1] "myfit_mdl: train complete: 82.158000 secs"
## Warning in myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst
## = list(id.prefix = "Low.cor.X", : model's bestTune found at an extreme of
## tuneGrid for parameter: alpha
## Warning in myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst
## = list(id.prefix = "Low.cor.X", : model's bestTune found at an extreme of
## tuneGrid for parameter: lambda
## Length Class Mode
## a0 56 -none- numeric
## beta 3976 dgCMatrix S4
## df 56 -none- numeric
## dim 2 -none- numeric
## lambda 56 -none- numeric
## dev.ratio 56 -none- numeric
## nulldev 1 -none- numeric
## npasses 1 -none- numeric
## jerr 1 -none- numeric
## offset 1 -none- logical
## classnames 2 -none- character
## call 5 -none- call
## nobs 1 -none- numeric
## lambdaOpt 1 -none- numeric
## xNames 71 -none- character
## problemType 1 -none- character
## tuneValue 2 data.frame list
## obsLevels 2 -none- character
## [1] "min lambda > lambdaOpt:"
## (Intercept)
## -2.833174e+00
## .pos
## 1.783078e-06
## .rnorm
## 1.344857e-02
## Channel.fctrEmail
## 1.876600e-01
## Channel.fctrFax
## 3.096568e-01
## Channel.fctrPhone
## -3.312246e-01
## Channel.fctrPostalmail
## -2.851531e-01
## Channel.fctrReferral
## -2.787625e-01
## Datesenttocompany.date.fctr(7,13]
## 4.382086e-03
## Datesenttocompany.date.fctr(13,19]
## 4.385329e-02
## Datesenttocompany.date.fctr(19,25]
## 1.795005e-02
## Datesenttocompany.date.fctr(25,31]
## 5.382646e-02
## Datesenttocompany.juliandate
## 5.072161e-04
## Datesenttocompany.last16.log1p
## 3.821318e-02
## Datesenttocompany.last2.log1p
## 1.005456e-02
## Datesenttocompany.last32.log1p
## 3.790284e-02
## Datesenttocompany.last4.log1p
## 1.229829e-02
## Datesenttocompany.last8.log1p
## 3.112155e-02
## Datesenttocompany.month.fctr03
## 9.397795e-02
## Datesenttocompany.month.fctr04
## -1.168159e-02
## Datesenttocompany.month.fctr05
## -9.611195e-02
## Datesenttocompany.month.fctr06
## 2.066597e-02
## Datesenttocompany.month.fctr07
## 7.760951e-02
## Datesenttocompany.month.fctr08
## 3.869755e-01
## Datesenttocompany.month.fctr09
## 2.592862e-01
## Datesenttocompany.month.fctr10
## 8.413044e-02
## Datesenttocompany.month.fctr12
## -9.193724e-02
## Datesenttocompany.wkday.fctr1
## -2.317356e-02
## Datesenttocompany.wkday.fctr2
## 9.613791e-02
## Datesenttocompany.wkday.fctr3
## 1.638049e-02
## Datesenttocompany.wkday.fctr5
## -6.542013e-02
## Datesenttocompany.wkday.fctr6
## 7.472078e-02
## Datesenttocompany.wkend
## -4.190717e-01
## Datesenttocompany.year.fctr2013
## 1.158689e-01
## Datesenttocompany.year.fctr2014
## 2.268959e-01
## Datesenttocompany.year.fctr2015
## 4.625708e-01
## Datesenttocompany.year.fctr2016
## 4.079339e-01
## Response.fctrClosed
## 1.432784e-02
## Response.fctrClosedwithmonetaryrelief
## -7.320088e-01
## Response.fctrClosedwithnon-monetaryrelief
## -6.679661e-01
## Response.fctrClosedwithoutrelief
## 1.058606e-01
## Response.fctrClosedwithrelief
## -3.791354e-01
## Rgn.Dvn.fctrMW#EastNorthCentral
## -5.425376e-02
## Rgn.Dvn.fctrMW#WestNorthCentral
## -1.421344e-01
## Rgn.Dvn.fctrMW#WestNorthCentral#MidAtlantic
## -3.705110e-02
## Rgn.Dvn.fctrMW#WestNorthCentral#NewEngland
## 8.816930e-03
## Rgn.Dvn.fctrOT#Other
## 1.117432e-01
## Rgn.Dvn.fctrSH#EastSouthCentral
## -8.728218e-02
## Rgn.Dvn.fctrSH#WestSouthCentral
## -6.817723e-02
## Rgn.Dvn.fctrWT#Mountain
## 3.002460e-02
## Rgn.Dvn.fctrWT#Pacific
## 2.195874e-02
## Sent.Recd.Dys
## -1.390798e-03
## Sent.Recd.Dys.nexp
## -1.271262e-02
## Sent.Recd.Dys.root2
## -1.790608e-02
## gCConsent.fctrConsentnotprovided
## -2.387427e-01
## gCConsent.fctrConsentprovided
## -4.673595e-02
## gCConsent.fctrOther
## -1.619361e-01
## gTags.fctrOlderAmerican
## 1.297520e-01
## gTags.fctrServicemember
## -9.518738e-02
## spcPrd.fctrConsumerLoan#Installmentloan
## 9.339089e-02
## spcPrd.fctrConsumerLoan#Pawnloan
## 3.067649e-01
## spcPrd.fctrConsumerLoan#Personallineofcredit
## 1.402139e-01
## spcPrd.fctrConsumerLoan#Vehiclelease
## 2.998269e-01
## spcPrd.fctrConsumerLoan#Vehicleloan
## 1.349598e-01
## spcPrd.fctrStudentloan#Federalstudentloanservicing
## 2.471046e-01
## [1] "max lambda < lambdaOpt:"
## [1] "Feats mismatch between coefs_left & rght:"
## [1] "(Intercept)"
## [2] ".pos"
## [3] ".rnorm"
## [4] "Channel.fctrEmail"
## [5] "Channel.fctrFax"
## [6] "Channel.fctrPhone"
## [7] "Channel.fctrPostalmail"
## [8] "Channel.fctrReferral"
## [9] "Datesenttocompany.date.fctr(7,13]"
## [10] "Datesenttocompany.date.fctr(13,19]"
## [11] "Datesenttocompany.date.fctr(19,25]"
## [12] "Datesenttocompany.date.fctr(25,31]"
## [13] "Datesenttocompany.juliandate"
## [14] "Datesenttocompany.last16.log1p"
## [15] "Datesenttocompany.last2.log1p"
## [16] "Datesenttocompany.last32.log1p"
## [17] "Datesenttocompany.last4.log1p"
## [18] "Datesenttocompany.last8.log1p"
## [19] "Datesenttocompany.month.fctr02"
## [20] "Datesenttocompany.month.fctr03"
## [21] "Datesenttocompany.month.fctr04"
## [22] "Datesenttocompany.month.fctr05"
## [23] "Datesenttocompany.month.fctr06"
## [24] "Datesenttocompany.month.fctr07"
## [25] "Datesenttocompany.month.fctr08"
## [26] "Datesenttocompany.month.fctr09"
## [27] "Datesenttocompany.month.fctr10"
## [28] "Datesenttocompany.month.fctr11"
## [29] "Datesenttocompany.month.fctr12"
## [30] "Datesenttocompany.wkday.fctr1"
## [31] "Datesenttocompany.wkday.fctr2"
## [32] "Datesenttocompany.wkday.fctr3"
## [33] "Datesenttocompany.wkday.fctr4"
## [34] "Datesenttocompany.wkday.fctr5"
## [35] "Datesenttocompany.wkday.fctr6"
## [36] "Datesenttocompany.wkend"
## [37] "Datesenttocompany.year.fctr2013"
## [38] "Datesenttocompany.year.fctr2014"
## [39] "Datesenttocompany.year.fctr2015"
## [40] "Datesenttocompany.year.fctr2016"
## [41] "Response.fctrClosed"
## [42] "Response.fctrClosedwithmonetaryrelief"
## [43] "Response.fctrClosedwithnon-monetaryrelief"
## [44] "Response.fctrClosedwithoutrelief"
## [45] "Response.fctrClosedwithrelief"
## [46] "Response.fctrInprogress"
## [47] "Response.fctrUntimelyresponse"
## [48] "Rgn.Dvn.fctrMW#EastNorthCentral"
## [49] "Rgn.Dvn.fctrMW#WestNorthCentral"
## [50] "Rgn.Dvn.fctrMW#WestNorthCentral#MidAtlantic"
## [51] "Rgn.Dvn.fctrMW#WestNorthCentral#NewEngland"
## [52] "Rgn.Dvn.fctrOT#Other"
## [53] "Rgn.Dvn.fctrSH#EastSouthCentral"
## [54] "Rgn.Dvn.fctrSH#WestSouthCentral"
## [55] "Rgn.Dvn.fctrWT#Mountain"
## [56] "Rgn.Dvn.fctrWT#Pacific"
## [57] "Sent.Recd.Dys"
## [58] "Sent.Recd.Dys.nexp"
## [59] "Sent.Recd.Dys.root2"
## [60] "gCConsent.fctrConsentnotprovided"
## [61] "gCConsent.fctrConsentprovided"
## [62] "gCConsent.fctrOther"
## [63] "gTags.fctrOlderAmerican"
## [64] "gTags.fctrOlderAmericanServicemember"
## [65] "gTags.fctrServicemember"
## [66] "spcPrd.fctrConsumerLoan#Installmentloan"
## [67] "spcPrd.fctrConsumerLoan#Pawnloan"
## [68] "spcPrd.fctrConsumerLoan#Personallineofcredit"
## [69] "spcPrd.fctrConsumerLoan#Titleloan"
## [70] "spcPrd.fctrConsumerLoan#Vehiclelease"
## [71] "spcPrd.fctrConsumerLoan#Vehicleloan"
## [72] "spcPrd.fctrStudentloan#Federalstudentloanservicing"
## [1] "myfit_mdl: train diagnostics complete: 82.508000 secs"
## Prediction
## Reference N Y
## N 26386 90
## Y 7090 626
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 7.900094e-01 1.145472e-01 7.856530e-01 7.943164e-01 7.743332e-01
## AccuracyPValue McnemarPValue
## 1.419367e-12 0.000000e+00
## Prediction
## Reference N Y
## N 6590 29
## Y 1779 150
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 0.7884885353 0.1081344011 0.7796751625 0.7971048346 0.7743331774
## AccuracyPValue McnemarPValue
## 0.0008470695 0.0000000000
## [1] "myfit_mdl: predict complete: 119.499000 secs"
## id
## 1 Low.cor.X##rcv#glmnet
## feats
## 1 Datesenttocompany.last16.log1p,Datesenttocompany.last8.log1p,Datesenttocompany.last4.log1p,Datesenttocompany.last32.log1p,Datesenttocompany.last2.log1p,spcPrd.fctr,gCConsent.fctr,Datesenttocompany.year.fctr,Datesenttocompany.juliandate,Datesenttocompany.month.fctr,.pos,Datesenttocompany.wkend,Sent.Recd.Dys.nexp,Datesenttocompany.date.fctr,.rnorm,Rgn.Dvn.fctr,gTags.fctr,Datesenttocompany.wkday.fctr,Sent.Recd.Dys,Sent.Recd.Dys.root2,Channel.fctr,Response.fctr
## max.nTuningRuns min.elapsedtime.everything min.elapsedtime.final
## 1 20 81.479 2.295
## max.AUCpROC.fit max.Sens.fit max.Spec.fit max.AUCROCR.fit
## 1 0.5282722 0.9991313 0.05741317 0.6063701
## opt.prob.threshold.fit max.f.score.fit max.Accuracy.fit
## 1 0.45 0.148482 0.7864315
## max.AccuracyLower.fit max.AccuracyUpper.fit max.Kappa.fit
## 1 0.785653 0.7943164 0.08402946
## min.log.loss.mean.fit max.AUCpROC.OOB max.Sens.OOB max.Spec.OOB
## 1 0.5142341 0.5289447 0.9987914 0.05909798
## max.AUCROCR.OOB opt.prob.threshold.OOB max.f.score.OOB max.Accuracy.OOB
## 1 0.6064605 0.45 0.142315 0.7884885
## max.AccuracyLower.OOB max.AccuracyUpper.OOB max.Kappa.OOB
## 1 0.7796752 0.7971048 0.1081344
## min.log.loss.mean.OOB max.AccuracySD.fit max.KappaSD.fit
## 1 0.5142419 0.001929325 0.01155837
## [1] "myfit_mdl: exit: 119.985000 secs"
fit.models_0_chunk_df <-
myadd_chunk(fit.models_0_chunk_df, "fit.models_0_end", major.inc = FALSE,
label.minor = "teardown")
## label step_major step_minor label_minor bgn end
## 7 fit.models_0_Low.cor.X 1 6 glmnet 254.719 374.833
## 8 fit.models_0_end 1 7 teardown 374.833 NA
## elapsed
## 7 120.114
## 8 NA
rm(ret_lst)
glb_chunks_df <- myadd_chunk(glb_chunks_df, "fit.models", major.inc = FALSE)
## label step_major step_minor label_minor bgn end elapsed
## 2 fit.models 2 0 0 22.460 374.899 352.439
## 3 fit.models 2 1 1 374.899 NA NA
if (!is.null(glbChunks$first) && (glbChunks$first == "fit.models_1") &&
(is.null(knitr::opts_current$get(name = 'label')))) # not knitting
myloadChunk(glbChunks$inpFilePathName, keepSpec = c(NULL), dropSpec = c(NULL))
## label step_major step_minor label_minor bgn end elapsed
## 1 fit.models_1_bgn 1 0 setup 379.827 NA NA
## label step_major step_minor label_minor bgn end
## 1 fit.models_1_bgn 1 0 setup 379.827 379.835
## 2 fit.models_1_All.X 1 1 setup 379.836 NA
## elapsed
## 1 0.008
## 2 NA
## label step_major step_minor label_minor bgn end
## 2 fit.models_1_All.X 1 1 setup 379.836 379.84
## 3 fit.models_1_All.X 1 2 glmnet 379.840 NA
## elapsed
## 2 0.004
## 3 NA
## [1] "myfit_mdl: enter: 0.001000 secs"
## [1] "myfit_mdl: fitting model: All.X##rcv#glmnet"
## [1] " indepVar: Datereceived.last16.log1p,Datesenttocompany.last16.log1p,Datereceived.last8.log1p,Datesenttocompany.last8.log1p,Datereceived.last4.log1p,Datesenttocompany.last4.log1p,Datereceived.last32.log1p,Datesenttocompany.last32.log1p,Datereceived.last2.log1p,Datesenttocompany.last2.log1p,ComplaintID,spcPrd.fctr,gCConsent.fctr,Datereceived.year.fctr,Datesenttocompany.year.fctr,Datereceived.juliandate,Datesenttocompany.juliandate,Datereceived.month.fctr,Datesenttocompany.month.fctr,.pos,Datereceived.wkend,Datesenttocompany.wkend,Sent.Recd.Dys.nexp,Datereceived.date.fctr,Datesenttocompany.date.fctr,.rnorm,Rgn.Dvn.fctr,gTags.fctr,Datereceived.wkday.fctr,Datesenttocompany.wkday.fctr,Sent.Recd.Dys,Sent.Recd.Dys.root2,Sent.Recd.Dys.log1p,Channel.fctr,Response.fctr"
## [1] "myfit_mdl: setup complete: 0.599000 secs"
## Aggregating results
## Selecting tuning parameters
## Fitting alpha = 0.775, lambda = 0.000144 on full training set
## [1] "myfit_mdl: train complete: 103.854000 secs"
## Length Class Mode
## a0 51 -none- numeric
## beta 5355 dgCMatrix S4
## df 51 -none- numeric
## dim 2 -none- numeric
## lambda 51 -none- numeric
## dev.ratio 51 -none- numeric
## nulldev 1 -none- numeric
## npasses 1 -none- numeric
## jerr 1 -none- numeric
## offset 1 -none- logical
## classnames 2 -none- character
## call 5 -none- call
## nobs 1 -none- numeric
## lambdaOpt 1 -none- numeric
## xNames 105 -none- character
## problemType 1 -none- character
## tuneValue 2 data.frame list
## obsLevels 2 -none- character
## [1] "min lambda > lambdaOpt:"
## (Intercept)
## -2.779835e+00
## .pos
## 1.733998e-06
## .rnorm
## 1.273158e-02
## Channel.fctrEmail
## 1.725324e-01
## Channel.fctrFax
## 3.069697e-01
## Channel.fctrPhone
## -3.244746e-01
## Channel.fctrPostalmail
## -2.763321e-01
## Channel.fctrReferral
## -2.721436e-01
## ComplaintID
## 2.224254e-07
## Datereceived.date.fctr(13,19]
## 2.318821e-02
## Datereceived.date.fctr(19,25]
## 8.029148e-03
## Datereceived.date.fctr(25,31]
## 4.351843e-02
## Datereceived.juliandate
## 2.246878e-04
## Datereceived.last16.log1p
## 2.574684e-02
## Datereceived.last2.log1p
## 7.008923e-03
## Datereceived.last32.log1p
## 2.396844e-02
## Datereceived.last4.log1p
## 8.346374e-03
## Datereceived.last8.log1p
## 1.854914e-02
## Datereceived.month.fctr03
## 6.968494e-02
## Datereceived.month.fctr04
## -1.539828e-02
## Datereceived.month.fctr05
## -8.975999e-02
## Datereceived.month.fctr06
## 9.075259e-03
## Datereceived.month.fctr07
## 4.681129e-02
## Datereceived.month.fctr08
## 2.246360e-01
## Datereceived.month.fctr09
## 1.650656e-01
## Datereceived.month.fctr10
## 7.004007e-02
## Datereceived.month.fctr11
## -6.473866e-05
## Datereceived.month.fctr12
## -7.599654e-02
## Datereceived.wkday.fctr1
## -1.783201e-02
## Datereceived.wkday.fctr2
## 8.772174e-02
## Datereceived.wkday.fctr3
## 1.616145e-02
## Datereceived.wkday.fctr5
## -4.323282e-02
## Datereceived.wkday.fctr6
## 3.459055e-02
## Datereceived.wkend
## -2.768417e-01
## Datereceived.year.fctr2013
## 1.251975e-02
## Datereceived.year.fctr2014
## 2.290299e-02
## Datereceived.year.fctr2015
## 1.223159e-01
## Datereceived.year.fctr2016
## -1.112891e-03
## Datesenttocompany.date.fctr(13,19]
## 1.481261e-02
## Datesenttocompany.date.fctr(19,25]
## 4.277123e-03
## Datesenttocompany.date.fctr(25,31]
## 4.802578e-03
## Datesenttocompany.juliandate
## 4.563628e-07
## Datesenttocompany.last16.log1p
## 1.197858e-02
## Datesenttocompany.last2.log1p
## 2.774536e-03
## Datesenttocompany.last32.log1p
## 1.346203e-02
## Datesenttocompany.last4.log1p
## 3.673185e-03
## Datesenttocompany.last8.log1p
## 1.208622e-02
## Datesenttocompany.month.fctr03
## 1.661770e-02
## Datesenttocompany.month.fctr05
## -7.933661e-03
## Datesenttocompany.month.fctr06
## 4.453455e-03
## Datesenttocompany.month.fctr07
## 2.515360e-02
## Datesenttocompany.month.fctr08
## 1.558850e-01
## Datesenttocompany.month.fctr09
## 8.862737e-02
## Datesenttocompany.month.fctr10
## 8.477540e-03
## Datesenttocompany.month.fctr12
## -1.598533e-02
## Datesenttocompany.wkday.fctr1
## -4.530856e-04
## Datesenttocompany.wkday.fctr2
## 8.374467e-03
## Datesenttocompany.wkday.fctr5
## -1.866043e-02
## Datesenttocompany.wkday.fctr6
## 2.662482e-02
## Datesenttocompany.wkend
## -1.228178e-01
## Datesenttocompany.year.fctr2013
## 1.918404e-02
## Datesenttocompany.year.fctr2014
## 1.009962e-02
## Datesenttocompany.year.fctr2015
## 3.091480e-02
## Response.fctrClosed
## 1.016203e-02
## Response.fctrClosedwithmonetaryrelief
## -7.254810e-01
## Response.fctrClosedwithnon-monetaryrelief
## -6.629208e-01
## Response.fctrClosedwithoutrelief
## 8.565959e-02
## Response.fctrClosedwithrelief
## -3.769168e-01
## Rgn.Dvn.fctrMW#EastNorthCentral
## -5.089739e-02
## Rgn.Dvn.fctrMW#WestNorthCentral
## -1.374717e-01
## Rgn.Dvn.fctrMW#WestNorthCentral#MidAtlantic
## -3.453292e-02
## Rgn.Dvn.fctrMW#WestNorthCentral#NewEngland
## 6.836228e-03
## Rgn.Dvn.fctrOT#Other
## 1.051754e-01
## Rgn.Dvn.fctrSH#EastSouthCentral
## -8.196752e-02
## Rgn.Dvn.fctrSH#WestSouthCentral
## -6.438879e-02
## Rgn.Dvn.fctrWT#Mountain
## 2.874673e-02
## Rgn.Dvn.fctrWT#Pacific
## 2.190319e-02
## Sent.Recd.Dys
## -1.816448e-03
## Sent.Recd.Dys.nexp
## -3.779186e-03
## Sent.Recd.Dys.root2
## -1.181396e-02
## gCConsent.fctrConsentnotprovided
## -2.398871e-01
## gCConsent.fctrConsentprovided
## -4.903913e-02
## gCConsent.fctrOther
## -1.565443e-01
## gTags.fctrOlderAmerican
## 1.272591e-01
## gTags.fctrServicemember
## -9.147088e-02
## spcPrd.fctrConsumerLoan#Installmentloan
## 8.576589e-02
## spcPrd.fctrConsumerLoan#Pawnloan
## 2.875993e-01
## spcPrd.fctrConsumerLoan#Personallineofcredit
## 1.312179e-01
## spcPrd.fctrConsumerLoan#Vehiclelease
## 2.917392e-01
## spcPrd.fctrConsumerLoan#Vehicleloan
## 1.291155e-01
## spcPrd.fctrStudentloan#Federalstudentloanservicing
## 2.337204e-01
## [1] "max lambda < lambdaOpt:"
## [1] "Feats mismatch between coefs_left & rght:"
## [1] "(Intercept)"
## [2] ".pos"
## [3] ".rnorm"
## [4] "Channel.fctrEmail"
## [5] "Channel.fctrFax"
## [6] "Channel.fctrPhone"
## [7] "Channel.fctrPostalmail"
## [8] "Channel.fctrReferral"
## [9] "ComplaintID"
## [10] "Datereceived.date.fctr(7,13]"
## [11] "Datereceived.date.fctr(13,19]"
## [12] "Datereceived.date.fctr(19,25]"
## [13] "Datereceived.date.fctr(25,31]"
## [14] "Datereceived.juliandate"
## [15] "Datereceived.last16.log1p"
## [16] "Datereceived.last2.log1p"
## [17] "Datereceived.last32.log1p"
## [18] "Datereceived.last4.log1p"
## [19] "Datereceived.last8.log1p"
## [20] "Datereceived.month.fctr02"
## [21] "Datereceived.month.fctr03"
## [22] "Datereceived.month.fctr04"
## [23] "Datereceived.month.fctr05"
## [24] "Datereceived.month.fctr06"
## [25] "Datereceived.month.fctr07"
## [26] "Datereceived.month.fctr08"
## [27] "Datereceived.month.fctr09"
## [28] "Datereceived.month.fctr10"
## [29] "Datereceived.month.fctr11"
## [30] "Datereceived.month.fctr12"
## [31] "Datereceived.wkday.fctr1"
## [32] "Datereceived.wkday.fctr2"
## [33] "Datereceived.wkday.fctr3"
## [34] "Datereceived.wkday.fctr4"
## [35] "Datereceived.wkday.fctr5"
## [36] "Datereceived.wkday.fctr6"
## [37] "Datereceived.wkend"
## [38] "Datereceived.year.fctr2013"
## [39] "Datereceived.year.fctr2014"
## [40] "Datereceived.year.fctr2015"
## [41] "Datereceived.year.fctr2016"
## [42] "Datesenttocompany.date.fctr(7,13]"
## [43] "Datesenttocompany.date.fctr(13,19]"
## [44] "Datesenttocompany.date.fctr(19,25]"
## [45] "Datesenttocompany.date.fctr(25,31]"
## [46] "Datesenttocompany.juliandate"
## [47] "Datesenttocompany.last16.log1p"
## [48] "Datesenttocompany.last2.log1p"
## [49] "Datesenttocompany.last32.log1p"
## [50] "Datesenttocompany.last4.log1p"
## [51] "Datesenttocompany.last8.log1p"
## [52] "Datesenttocompany.month.fctr02"
## [53] "Datesenttocompany.month.fctr03"
## [54] "Datesenttocompany.month.fctr04"
## [55] "Datesenttocompany.month.fctr05"
## [56] "Datesenttocompany.month.fctr06"
## [57] "Datesenttocompany.month.fctr07"
## [58] "Datesenttocompany.month.fctr08"
## [59] "Datesenttocompany.month.fctr09"
## [60] "Datesenttocompany.month.fctr10"
## [61] "Datesenttocompany.month.fctr11"
## [62] "Datesenttocompany.month.fctr12"
## [63] "Datesenttocompany.wkday.fctr1"
## [64] "Datesenttocompany.wkday.fctr2"
## [65] "Datesenttocompany.wkday.fctr3"
## [66] "Datesenttocompany.wkday.fctr4"
## [67] "Datesenttocompany.wkday.fctr5"
## [68] "Datesenttocompany.wkday.fctr6"
## [69] "Datesenttocompany.wkend"
## [70] "Datesenttocompany.year.fctr2013"
## [71] "Datesenttocompany.year.fctr2014"
## [72] "Datesenttocompany.year.fctr2015"
## [73] "Datesenttocompany.year.fctr2016"
## [74] "Response.fctrClosed"
## [75] "Response.fctrClosedwithmonetaryrelief"
## [76] "Response.fctrClosedwithnon-monetaryrelief"
## [77] "Response.fctrClosedwithoutrelief"
## [78] "Response.fctrClosedwithrelief"
## [79] "Response.fctrInprogress"
## [80] "Response.fctrUntimelyresponse"
## [81] "Rgn.Dvn.fctrMW#EastNorthCentral"
## [82] "Rgn.Dvn.fctrMW#WestNorthCentral"
## [83] "Rgn.Dvn.fctrMW#WestNorthCentral#MidAtlantic"
## [84] "Rgn.Dvn.fctrMW#WestNorthCentral#NewEngland"
## [85] "Rgn.Dvn.fctrOT#Other"
## [86] "Rgn.Dvn.fctrSH#EastSouthCentral"
## [87] "Rgn.Dvn.fctrSH#WestSouthCentral"
## [88] "Rgn.Dvn.fctrWT#Mountain"
## [89] "Rgn.Dvn.fctrWT#Pacific"
## [90] "Sent.Recd.Dys"
## [91] "Sent.Recd.Dys.log1p"
## [92] "Sent.Recd.Dys.nexp"
## [93] "Sent.Recd.Dys.root2"
## [94] "gCConsent.fctrConsentnotprovided"
## [95] "gCConsent.fctrConsentprovided"
## [96] "gCConsent.fctrOther"
## [97] "gTags.fctrOlderAmerican"
## [98] "gTags.fctrOlderAmericanServicemember"
## [99] "gTags.fctrServicemember"
## [100] "spcPrd.fctrConsumerLoan#Installmentloan"
## [101] "spcPrd.fctrConsumerLoan#Pawnloan"
## [102] "spcPrd.fctrConsumerLoan#Personallineofcredit"
## [103] "spcPrd.fctrConsumerLoan#Titleloan"
## [104] "spcPrd.fctrConsumerLoan#Vehiclelease"
## [105] "spcPrd.fctrConsumerLoan#Vehicleloan"
## [106] "spcPrd.fctrStudentloan#Federalstudentloanservicing"
## [1] "myfit_mdl: train diagnostics complete: 104.362000 secs"
## Prediction
## Reference N Y
## N 26391 85
## Y 7093 623
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 7.900679e-01 1.143144e-01 7.857119e-01 7.943745e-01 7.743332e-01
## AccuracyPValue McnemarPValue
## 1.176293e-12 0.000000e+00
## Prediction
## Reference N Y
## N 6591 28
## Y 1783 146
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 0.788137576 0.105443974 0.779319074 0.796759245 0.774333177
## AccuracyPValue McnemarPValue
## 0.001105263 0.000000000
## [1] "myfit_mdl: predict complete: 144.295000 secs"
## id
## 1 All.X##rcv#glmnet
## feats
## 1 Datereceived.last16.log1p,Datesenttocompany.last16.log1p,Datereceived.last8.log1p,Datesenttocompany.last8.log1p,Datereceived.last4.log1p,Datesenttocompany.last4.log1p,Datereceived.last32.log1p,Datesenttocompany.last32.log1p,Datereceived.last2.log1p,Datesenttocompany.last2.log1p,ComplaintID,spcPrd.fctr,gCConsent.fctr,Datereceived.year.fctr,Datesenttocompany.year.fctr,Datereceived.juliandate,Datesenttocompany.juliandate,Datereceived.month.fctr,Datesenttocompany.month.fctr,.pos,Datereceived.wkend,Datesenttocompany.wkend,Sent.Recd.Dys.nexp,Datereceived.date.fctr,Datesenttocompany.date.fctr,.rnorm,Rgn.Dvn.fctr,gTags.fctr,Datereceived.wkday.fctr,Datesenttocompany.wkday.fctr,Sent.Recd.Dys,Sent.Recd.Dys.root2,Sent.Recd.Dys.log1p,Channel.fctr,Response.fctr
## max.nTuningRuns min.elapsedtime.everything min.elapsedtime.final
## 1 25 103.08 2.634
## max.AUCpROC.fit max.Sens.fit max.Spec.fit max.AUCROCR.fit
## 1 0.5277727 0.9991691 0.05637636 0.606051
## opt.prob.threshold.fit max.f.score.fit max.Accuracy.fit
## 1 0.45 0.1479107 0.786373
## max.AccuracyLower.fit max.AccuracyUpper.fit max.Kappa.fit
## 1 0.7857119 0.7943745 0.08343923
## min.log.loss.mean.fit max.AUCpROC.OOB max.Sens.OOB max.Spec.OOB
## 1 0.5142415 0.5272813 0.9990935 0.05546916
## max.AUCROCR.OOB opt.prob.threshold.OOB max.f.score.OOB max.Accuracy.OOB
## 1 0.6057797 0.45 0.1388493 0.7881376
## max.AccuracyLower.OOB max.AccuracyUpper.OOB max.Kappa.OOB
## 1 0.7793191 0.7967592 0.105444
## min.log.loss.mean.OOB max.AccuracySD.fit max.KappaSD.fit
## 1 0.5143124 0.001940243 0.0117971
## [1] "myfit_mdl: exit: 145.038000 secs"
## label step_major step_minor label_minor bgn end
## 3 fit.models_1_All.X 1 2 glmnet 379.840 524.881
## 4 fit.models_1_All.X 1 3 glm 524.882 NA
## elapsed
## 3 145.041
## 4 NA
## [1] "myfit_mdl: enter: 0.000000 secs"
## [1] "myfit_mdl: fitting model: All.X##rcv#glm"
## [1] " indepVar: Datereceived.last16.log1p,Datesenttocompany.last16.log1p,Datereceived.last8.log1p,Datesenttocompany.last8.log1p,Datereceived.last4.log1p,Datesenttocompany.last4.log1p,Datereceived.last32.log1p,Datesenttocompany.last32.log1p,Datereceived.last2.log1p,Datesenttocompany.last2.log1p,ComplaintID,spcPrd.fctr,gCConsent.fctr,Datereceived.year.fctr,Datesenttocompany.year.fctr,Datereceived.juliandate,Datesenttocompany.juliandate,Datereceived.month.fctr,Datesenttocompany.month.fctr,.pos,Datereceived.wkend,Datesenttocompany.wkend,Sent.Recd.Dys.nexp,Datereceived.date.fctr,Datesenttocompany.date.fctr,.rnorm,Rgn.Dvn.fctr,gTags.fctr,Datereceived.wkday.fctr,Datesenttocompany.wkday.fctr,Sent.Recd.Dys,Sent.Recd.Dys.root2,Sent.Recd.Dys.log1p,Channel.fctr,Response.fctr"
## [1] "myfit_mdl: setup complete: 0.723000 secs"
## Aggregating results
## Fitting final model on full training set
## [1] "myfit_mdl: train complete: 44.838000 secs"
## parameter
## 1 none
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.3299 -0.7463 -0.6412 -0.4475 2.5831
##
## Coefficients: (35 not defined because of singularities)
## Estimate Std. Error
## (Intercept) -3.113e+00 2.121e-01
## .pos 1.793e-06 1.212e-07
## .rnorm 1.433e-02 1.319e-02
## Channel.fctrEmail 2.179e-01 4.618e-01
## Channel.fctrFax 2.909e-01 1.402e-01
## Channel.fctrPhone -3.627e-01 6.173e-02
## Channel.fctrPostalmail -3.186e-01 8.483e-02
## Channel.fctrReferral -3.099e-01 5.427e-02
## ComplaintID 1.057e-06 6.306e-07
## `Datereceived.date.fctr(7,13]` 8.508e-02 6.366e-02
## `Datereceived.date.fctr(13,19]` 1.895e-01 1.021e-01
## `Datereceived.date.fctr(19,25]` 2.283e-01 1.446e-01
## `Datereceived.date.fctr(25,31]` 3.273e-01 1.874e-01
## Datereceived.juliandate -1.172e-02 7.583e-03
## Datereceived.last16.log1p 3.864e-02 3.525e-03
## Datereceived.last2.log1p 1.014e-02 5.574e-03
## Datereceived.last32.log1p 3.876e-02 3.992e-03
## Datereceived.last4.log1p 1.247e-02 4.971e-03
## Datereceived.last8.log1p 3.126e-02 3.915e-03
## Datereceived.month.fctr02 3.389e-01 2.446e-01
## Datereceived.month.fctr03 7.418e-01 4.538e-01
## Datereceived.month.fctr04 9.672e-01 6.845e-01
## Datereceived.month.fctr05 1.207e+00 9.109e-01
## Datereceived.month.fctr06 1.661e+00 1.143e+00
## Datereceived.month.fctr07 2.045e+00 1.370e+00
## Datereceived.month.fctr08 2.690e+00 1.603e+00
## Datereceived.month.fctr09 2.898e+00 1.836e+00
## Datereceived.month.fctr10 3.057e+00 2.062e+00
## Datereceived.month.fctr11 3.302e+00 2.297e+00
## Datereceived.month.fctr12 3.533e+00 2.522e+00
## Datereceived.wkday.fctr1 4.162e-01 7.143e-02
## Datereceived.wkday.fctr2 5.420e-01 7.126e-02
## Datereceived.wkday.fctr3 4.602e-01 7.166e-02
## Datereceived.wkday.fctr4 4.422e-01 7.191e-02
## Datereceived.wkday.fctr5 3.675e-01 7.214e-02
## Datereceived.wkday.fctr6 9.199e-02 7.820e-02
## Datereceived.wkend NA NA
## Datereceived.year.fctr2013 -2.722e-01 2.544e-01
## Datereceived.year.fctr2014 -6.634e-01 5.501e-01
## Datereceived.year.fctr2015 -9.533e-01 8.776e-01
## Datereceived.year.fctr2016 -1.556e+00 1.209e+00
## `Datesenttocompany.date.fctr(7,13]` NA NA
## `Datesenttocompany.date.fctr(13,19]` NA NA
## `Datesenttocompany.date.fctr(19,25]` NA NA
## `Datesenttocompany.date.fctr(25,31]` NA NA
## Datesenttocompany.juliandate NA NA
## Datesenttocompany.last16.log1p NA NA
## Datesenttocompany.last2.log1p NA NA
## Datesenttocompany.last32.log1p NA NA
## Datesenttocompany.last4.log1p NA NA
## Datesenttocompany.last8.log1p NA NA
## Datesenttocompany.month.fctr02 NA NA
## Datesenttocompany.month.fctr03 NA NA
## Datesenttocompany.month.fctr04 NA NA
## Datesenttocompany.month.fctr05 NA NA
## Datesenttocompany.month.fctr06 NA NA
## Datesenttocompany.month.fctr07 NA NA
## Datesenttocompany.month.fctr08 NA NA
## Datesenttocompany.month.fctr09 NA NA
## Datesenttocompany.month.fctr10 NA NA
## Datesenttocompany.month.fctr11 NA NA
## Datesenttocompany.month.fctr12 NA NA
## Datesenttocompany.wkday.fctr1 NA NA
## Datesenttocompany.wkday.fctr2 NA NA
## Datesenttocompany.wkday.fctr3 NA NA
## Datesenttocompany.wkday.fctr4 NA NA
## Datesenttocompany.wkday.fctr5 NA NA
## Datesenttocompany.wkday.fctr6 NA NA
## Datesenttocompany.wkend NA NA
## Datesenttocompany.year.fctr2013 NA NA
## Datesenttocompany.year.fctr2014 NA NA
## Datesenttocompany.year.fctr2015 NA NA
## Datesenttocompany.year.fctr2016 NA NA
## Response.fctrClosed 2.152e-02 1.076e-01
## Response.fctrClosedwithmonetaryrelief -7.382e-01 6.585e-02
## `Response.fctrClosedwithnon-monetaryrelief` -6.728e-01 5.565e-02
## Response.fctrClosedwithoutrelief 3.972e-02 1.240e-01
## Response.fctrClosedwithrelief -4.846e-01 2.612e-01
## Response.fctrInprogress NA NA
## Response.fctrUntimelyresponse NA NA
## `Rgn.Dvn.fctrMW#EastNorthCentral` -5.977e-02 4.475e-02
## `Rgn.Dvn.fctrMW#WestNorthCentral` -1.489e-01 6.818e-02
## `Rgn.Dvn.fctrMW#WestNorthCentral#MidAtlantic` -4.153e-02 4.343e-02
## `Rgn.Dvn.fctrMW#WestNorthCentral#NewEngland` 1.312e-02 6.402e-02
## `Rgn.Dvn.fctrOT#Other` 1.187e-01 1.328e-01
## `Rgn.Dvn.fctrSH#EastSouthCentral` -9.692e-02 6.695e-02
## `Rgn.Dvn.fctrSH#WestSouthCentral` -7.475e-02 5.057e-02
## `Rgn.Dvn.fctrWT#Mountain` 3.015e-02 5.801e-02
## `Rgn.Dvn.fctrWT#Pacific` 2.288e-02 4.337e-02
## Sent.Recd.Dys 5.812e-03 6.184e-03
## Sent.Recd.Dys.log1p 2.461e-01 2.029e-01
## Sent.Recd.Dys.nexp 3.285e-02 7.092e-02
## Sent.Recd.Dys.root2 -2.101e-01 1.523e-01
## gCConsent.fctrConsentnotprovided -2.849e-01 5.916e-02
## gCConsent.fctrConsentprovided -9.142e-02 5.433e-02
## gCConsent.fctrOther -2.208e-01 1.402e-01
## gTags.fctrOlderAmerican 1.317e-01 5.758e-02
## gTags.fctrOlderAmericanServicemember -5.133e-03 1.338e-01
## gTags.fctrServicemember -1.035e-01 5.753e-02
## `spcPrd.fctrConsumerLoan#Installmentloan` 9.812e-02 4.444e-02
## `spcPrd.fctrConsumerLoan#Pawnloan` 3.275e-01 3.098e-01
## `spcPrd.fctrConsumerLoan#Personallineofcredit` 1.554e-01 7.558e-02
## `spcPrd.fctrConsumerLoan#Titleloan` -8.223e-03 1.486e-01
## `spcPrd.fctrConsumerLoan#Vehiclelease` 3.100e-01 6.323e-02
## `spcPrd.fctrConsumerLoan#Vehicleloan` 1.430e-01 3.277e-02
## `spcPrd.fctrStudentloan#Federalstudentloanservicing` 2.624e-01 7.198e-02
## z value Pr(>|z|)
## (Intercept) -14.677 < 2e-16 ***
## .pos 14.797 < 2e-16 ***
## .rnorm 1.087 0.277222
## Channel.fctrEmail 0.472 0.637051
## Channel.fctrFax 2.074 0.038065 *
## Channel.fctrPhone -5.875 4.23e-09 ***
## Channel.fctrPostalmail -3.756 0.000173 ***
## Channel.fctrReferral -5.711 1.13e-08 ***
## ComplaintID 1.676 0.093696 .
## `Datereceived.date.fctr(7,13]` 1.336 0.181404
## `Datereceived.date.fctr(13,19]` 1.856 0.063416 .
## `Datereceived.date.fctr(19,25]` 1.578 0.114479
## `Datereceived.date.fctr(25,31]` 1.746 0.080768 .
## Datereceived.juliandate -1.546 0.122151
## Datereceived.last16.log1p 10.961 < 2e-16 ***
## Datereceived.last2.log1p 1.820 0.068831 .
## Datereceived.last32.log1p 9.707 < 2e-16 ***
## Datereceived.last4.log1p 2.508 0.012139 *
## Datereceived.last8.log1p 7.984 1.41e-15 ***
## Datereceived.month.fctr02 1.385 0.165950
## Datereceived.month.fctr03 1.635 0.102136
## Datereceived.month.fctr04 1.413 0.157701
## Datereceived.month.fctr05 1.325 0.185292
## Datereceived.month.fctr06 1.453 0.146193
## Datereceived.month.fctr07 1.493 0.135407
## Datereceived.month.fctr08 1.678 0.093344 .
## Datereceived.month.fctr09 1.578 0.114546
## Datereceived.month.fctr10 1.483 0.138185
## Datereceived.month.fctr11 1.438 0.150517
## Datereceived.month.fctr12 1.401 0.161265
## Datereceived.wkday.fctr1 5.827 5.65e-09 ***
## Datereceived.wkday.fctr2 7.605 2.85e-14 ***
## Datereceived.wkday.fctr3 6.422 1.34e-10 ***
## Datereceived.wkday.fctr4 6.150 7.76e-10 ***
## Datereceived.wkday.fctr5 5.094 3.50e-07 ***
## Datereceived.wkday.fctr6 1.176 0.239484
## Datereceived.wkend NA NA
## Datereceived.year.fctr2013 -1.070 0.284662
## Datereceived.year.fctr2014 -1.206 0.227816
## Datereceived.year.fctr2015 -1.086 0.277367
## Datereceived.year.fctr2016 -1.287 0.197947
## `Datesenttocompany.date.fctr(7,13]` NA NA
## `Datesenttocompany.date.fctr(13,19]` NA NA
## `Datesenttocompany.date.fctr(19,25]` NA NA
## `Datesenttocompany.date.fctr(25,31]` NA NA
## Datesenttocompany.juliandate NA NA
## Datesenttocompany.last16.log1p NA NA
## Datesenttocompany.last2.log1p NA NA
## Datesenttocompany.last32.log1p NA NA
## Datesenttocompany.last4.log1p NA NA
## Datesenttocompany.last8.log1p NA NA
## Datesenttocompany.month.fctr02 NA NA
## Datesenttocompany.month.fctr03 NA NA
## Datesenttocompany.month.fctr04 NA NA
## Datesenttocompany.month.fctr05 NA NA
## Datesenttocompany.month.fctr06 NA NA
## Datesenttocompany.month.fctr07 NA NA
## Datesenttocompany.month.fctr08 NA NA
## Datesenttocompany.month.fctr09 NA NA
## Datesenttocompany.month.fctr10 NA NA
## Datesenttocompany.month.fctr11 NA NA
## Datesenttocompany.month.fctr12 NA NA
## Datesenttocompany.wkday.fctr1 NA NA
## Datesenttocompany.wkday.fctr2 NA NA
## Datesenttocompany.wkday.fctr3 NA NA
## Datesenttocompany.wkday.fctr4 NA NA
## Datesenttocompany.wkday.fctr5 NA NA
## Datesenttocompany.wkday.fctr6 NA NA
## Datesenttocompany.wkend NA NA
## Datesenttocompany.year.fctr2013 NA NA
## Datesenttocompany.year.fctr2014 NA NA
## Datesenttocompany.year.fctr2015 NA NA
## Datesenttocompany.year.fctr2016 NA NA
## Response.fctrClosed 0.200 0.841544
## Response.fctrClosedwithmonetaryrelief -11.210 < 2e-16 ***
## `Response.fctrClosedwithnon-monetaryrelief` -12.090 < 2e-16 ***
## Response.fctrClosedwithoutrelief 0.320 0.748755
## Response.fctrClosedwithrelief -1.856 0.063497 .
## Response.fctrInprogress NA NA
## Response.fctrUntimelyresponse NA NA
## `Rgn.Dvn.fctrMW#EastNorthCentral` -1.336 0.181699
## `Rgn.Dvn.fctrMW#WestNorthCentral` -2.184 0.028953 *
## `Rgn.Dvn.fctrMW#WestNorthCentral#MidAtlantic` -0.956 0.339013
## `Rgn.Dvn.fctrMW#WestNorthCentral#NewEngland` 0.205 0.837631
## `Rgn.Dvn.fctrOT#Other` 0.894 0.371141
## `Rgn.Dvn.fctrSH#EastSouthCentral` -1.448 0.147710
## `Rgn.Dvn.fctrSH#WestSouthCentral` -1.478 0.139328
## `Rgn.Dvn.fctrWT#Mountain` 0.520 0.603292
## `Rgn.Dvn.fctrWT#Pacific` 0.528 0.597846
## Sent.Recd.Dys 0.940 0.347292
## Sent.Recd.Dys.log1p 1.213 0.225261
## Sent.Recd.Dys.nexp 0.463 0.643178
## Sent.Recd.Dys.root2 -1.379 0.167821
## gCConsent.fctrConsentnotprovided -4.815 1.47e-06 ***
## gCConsent.fctrConsentprovided -1.683 0.092453 .
## gCConsent.fctrOther -1.575 0.115354
## gTags.fctrOlderAmerican 2.287 0.022168 *
## gTags.fctrOlderAmericanServicemember -0.038 0.969405
## gTags.fctrServicemember -1.799 0.072051 .
## `spcPrd.fctrConsumerLoan#Installmentloan` 2.208 0.027242 *
## `spcPrd.fctrConsumerLoan#Pawnloan` 1.057 0.290568
## `spcPrd.fctrConsumerLoan#Personallineofcredit` 2.057 0.039727 *
## `spcPrd.fctrConsumerLoan#Titleloan` -0.055 0.955879
## `spcPrd.fctrConsumerLoan#Vehiclelease` 4.903 9.45e-07 ***
## `spcPrd.fctrConsumerLoan#Vehicleloan` 4.363 1.28e-05 ***
## `spcPrd.fctrStudentloan#Federalstudentloanservicing` 3.646 0.000266 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 36516 on 34191 degrees of freedom
## Residual deviance: 35157 on 34121 degrees of freedom
## AIC: 35299
##
## Number of Fisher Scoring iterations: 4
##
## [1] "mydisplayOutliers: "
##
## No Studentized residuals with Bonferonni p < 0.05
## Largest |rstudent|:
## rstudent unadjusted p-value Bonferonni p
## X121059 2.586102 0.0097068 NA
## [1] ""
## .rstudent .dffits .hatvalues
## Min. :-1.3338 Min. :-0.597293 Min. :0.0005464
## 1st Qu.:-0.7468 1st Qu.:-0.032705 1st Qu.:0.0013623
## Median :-0.6415 Median :-0.024956 Median :0.0017206
## Mean :-0.1573 Mean :-0.006729 Mean :0.0020765
## 3rd Qu.:-0.4476 3rd Qu.:-0.014702 3rd Qu.:0.0022271
## Max. : 2.5861 Max. : 0.416047 Max. :0.3356983
## [1] "myfit_mdl: train diagnostics complete: 50.436000 secs"
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Prediction
## Reference N Y
## N 26377 99
## Y 7075 641
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 7.901848e-01 1.167214e-01 7.858298e-01 7.944905e-01 7.743332e-01
## AccuracyPValue McnemarPValue
## 8.062204e-13 0.000000e+00
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Prediction
## Reference N Y
## N 6587 32
## Y 1775 154
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 0.7886055218 0.1103132507 0.7797938607 0.7972200290 0.7743331774
## AccuracyPValue McnemarPValue
## 0.0007741656 0.0000000000
## [1] "myfit_mdl: predict complete: 91.934000 secs"
## id
## 1 All.X##rcv#glm
## feats
## 1 Datereceived.last16.log1p,Datesenttocompany.last16.log1p,Datereceived.last8.log1p,Datesenttocompany.last8.log1p,Datereceived.last4.log1p,Datesenttocompany.last4.log1p,Datereceived.last32.log1p,Datesenttocompany.last32.log1p,Datereceived.last2.log1p,Datesenttocompany.last2.log1p,ComplaintID,spcPrd.fctr,gCConsent.fctr,Datereceived.year.fctr,Datesenttocompany.year.fctr,Datereceived.juliandate,Datesenttocompany.juliandate,Datereceived.month.fctr,Datesenttocompany.month.fctr,.pos,Datereceived.wkend,Datesenttocompany.wkend,Sent.Recd.Dys.nexp,Datereceived.date.fctr,Datesenttocompany.date.fctr,.rnorm,Rgn.Dvn.fctr,gTags.fctr,Datereceived.wkday.fctr,Datesenttocompany.wkday.fctr,Sent.Recd.Dys,Sent.Recd.Dys.root2,Sent.Recd.Dys.log1p,Channel.fctr,Response.fctr
## max.nTuningRuns min.elapsedtime.everything min.elapsedtime.final
## 1 1 43.953 4.183
## max.AUCpROC.fit max.Sens.fit max.Spec.fit max.AUCROCR.fit
## 1 0.5295197 0.9989047 0.06013478 0.6064428
## opt.prob.threshold.fit max.f.score.fit max.Accuracy.fit
## 1 0.45 0.1516083 0.7868897
## max.AccuracyLower.fit max.AccuracyUpper.fit max.Kappa.fit
## 1 0.7858298 0.7944905 0.08747701
## min.log.loss.mean.fit max.AUCpROC.OOB max.Sens.OOB max.Spec.OOB
## 1 0.5141175 0.5301651 0.9986403 0.06168999
## max.AUCROCR.OOB opt.prob.threshold.OOB max.f.score.OOB max.Accuracy.OOB
## 1 0.6057013 0.45 0.1456265 0.7886055
## max.AccuracyLower.OOB max.AccuracyUpper.OOB max.Kappa.OOB
## 1 0.7797939 0.79722 0.1103133
## min.log.loss.mean.OOB max.AccuracySD.fit max.KappaSD.fit
## 1 0.5143174 0.001972729 0.01193052
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## [1] "myfit_mdl: exit: 92.810000 secs"
## label step_major step_minor label_minor bgn end
## 4 fit.models_1_All.X 1 3 glm 524.882 617.856
## 5 fit.models_1_preProc 1 4 preProc 617.856 NA
## elapsed
## 4 92.974
## 5 NA
## Loading required package: gdata
## gdata: read.xls support for 'XLS' (Excel 97-2004) files ENABLED.
##
## gdata: read.xls support for 'XLSX' (Excel 2007+) files ENABLED.
##
## Attaching package: 'gdata'
## The following objects are masked from 'package:dplyr':
##
## combine, first, last
## The following object is masked from 'package:stats':
##
## nobs
## The following object is masked from 'package:utils':
##
## object.size
## The following object is masked from 'package:base':
##
## startsWith
## max.Accuracy.OOB max.AUCROCR.OOB
## Interact.High.cor.Y##rcv#glmnet 0.7927000 0.5771830
## All.X##rcv#glm 0.7886055 0.6057013
## Low.cor.X##rcv#glmnet 0.7884885 0.6064605
## All.X##rcv#glmnet 0.7881376 0.6057797
## Max.cor.Y.Time.Lag##rcv#glmnet 0.7743332 0.5583842
## Max.cor.Y.rcv.1X1###glmnet 0.7743332 0.5520402
## Random###myrandom_classfr 0.7743332 0.5080821
## MFO###myMFO_classfr 0.7743332 0.5000000
## Max.cor.Y##rcv#rpart 0.7743332 0.5000000
## max.AUCpROC.OOB min.elapsedtime.everything
## Interact.High.cor.Y##rcv#glmnet 0.5447352 61.686
## All.X##rcv#glm 0.5301651 43.953
## Low.cor.X##rcv#glmnet 0.5289447 81.479
## All.X##rcv#glmnet 0.5272813 103.080
## Max.cor.Y.Time.Lag##rcv#glmnet 0.5000000 9.539
## Max.cor.Y.rcv.1X1###glmnet 0.5000000 0.752
## Random###myrandom_classfr 0.4927907 0.301
## MFO###myMFO_classfr 0.5000000 0.296
## Max.cor.Y##rcv#rpart 0.5000000 2.376
## max.Accuracy.fit
## Interact.High.cor.Y##rcv#glmnet 0.7919981
## All.X##rcv#glm 0.7868897
## Low.cor.X##rcv#glmnet 0.7864315
## All.X##rcv#glmnet 0.7863730
## Max.cor.Y.Time.Lag##rcv#glmnet 0.7743332
## Max.cor.Y.rcv.1X1###glmnet 0.7743332
## Random###myrandom_classfr 0.7743332
## MFO###myMFO_classfr 0.7743332
## Max.cor.Y##rcv#rpart 0.7743332
## label step_major step_minor label_minor bgn end
## 5 fit.models_1_preProc 1 4 preProc 617.856 619.22
## 6 fit.models_1_end 1 5 teardown 619.221 NA
## elapsed
## 5 1.364
## 6 NA
## label step_major step_minor label_minor bgn end elapsed
## 3 fit.models 2 1 1 374.899 619.226 244.328
## 4 fit.models 2 2 2 619.227 NA NA
## label step_major step_minor label_minor bgn end elapsed
## 1 fit.models_2_bgn 1 0 setup 626.495 NA NA
## Warning: max.AccuracyUpper.fit already exists in glb_models_df
## [1] "var:max.KappaSD.fit"
## Loading required package: RColorBrewer
## Warning: Removed 5 rows containing missing values (geom_errorbar).
## quartz_off_screen
## 2
## Warning: Removed 5 rows containing missing values (geom_errorbar).
## id max.Accuracy.OOB max.AUCROCR.OOB
## 6 Interact.High.cor.Y##rcv#glmnet 0.7927000 0.5771830
## 9 All.X##rcv#glm 0.7886055 0.6057013
## 7 Low.cor.X##rcv#glmnet 0.7884885 0.6064605
## 8 All.X##rcv#glmnet 0.7881376 0.6057797
## 5 Max.cor.Y.Time.Lag##rcv#glmnet 0.7743332 0.5583842
## 3 Max.cor.Y.rcv.1X1###glmnet 0.7743332 0.5520402
## 2 Random###myrandom_classfr 0.7743332 0.5080821
## 1 MFO###myMFO_classfr 0.7743332 0.5000000
## 4 Max.cor.Y##rcv#rpart 0.7743332 0.5000000
## max.AUCpROC.OOB min.elapsedtime.everything max.Accuracy.fit
## 6 0.5447352 61.686 0.7919981
## 9 0.5301651 43.953 0.7868897
## 7 0.5289447 81.479 0.7864315
## 8 0.5272813 103.080 0.7863730
## 5 0.5000000 9.539 0.7743332
## 3 0.5000000 0.752 0.7743332
## 2 0.4927907 0.301 0.7743332
## 1 0.5000000 0.296 0.7743332
## 4 0.5000000 2.376 0.7743332
## opt.prob.threshold.fit opt.prob.threshold.OOB
## 6 0.50 0.50
## 9 0.45 0.45
## 7 0.45 0.45
## 8 0.45 0.45
## 5 0.50 0.50
## 3 0.50 0.50
## 2 0.80 0.80
## 1 0.50 0.50
## 4 0.50 0.50
## [1] "Metrics used for model selection:"
## ~-max.Accuracy.OOB - max.AUCROCR.OOB - max.AUCpROC.OOB + min.elapsedtime.everything -
## max.Accuracy.fit - opt.prob.threshold.OOB
## <environment: 0x7fccb6637470>
## [1] "Best model id: Interact.High.cor.Y##rcv#glmnet"
## glmnet
##
## 34192 samples
## 13 predictor
## 2 classes: 'N', 'Y'
##
## No pre-processing
## Resampling: Cross-Validated (7 fold, repeated 3 times)
## Summary of sample sizes: 29307, 29308, 29308, 29308, 29306, 29308, ...
## Resampling results across tuning parameters:
##
## alpha lambda Accuracy Kappa
## 0.100 6.251398e-05 0.7919883 0.12554995
## 0.100 2.901642e-04 0.7919883 0.12554995
## 0.100 1.346823e-03 0.7919591 0.12536904
## 0.100 6.251398e-03 0.7907503 0.11415551
## 0.100 2.901642e-02 0.7813816 0.04779839
## 0.325 6.251398e-05 0.7919104 0.12534360
## 0.325 2.901642e-04 0.7919104 0.12534360
## 0.325 1.346823e-03 0.7915984 0.12198438
## 0.325 6.251398e-03 0.7877184 0.09133621
## 0.325 2.901642e-02 0.7743332 0.00000000
## 0.550 6.251398e-05 0.7919981 0.12577169
## 0.550 2.901642e-04 0.7919981 0.12577169
## 0.550 1.346823e-03 0.7911695 0.11842180
## 0.550 6.251398e-03 0.7852812 0.07447617
## 0.550 2.901642e-02 0.7743332 0.00000000
## 0.775 6.251398e-05 0.7919883 0.12571205
## 0.775 2.901642e-04 0.7919883 0.12571205
## 0.775 1.346823e-03 0.7907405 0.11441774
## 0.775 6.251398e-03 0.7825710 0.05624026
## 0.775 2.901642e-02 0.7743332 0.00000000
## 1.000 6.251398e-05 0.7919884 0.12563469
## 1.000 2.901642e-04 0.7919494 0.12543096
## 1.000 1.346823e-03 0.7901946 0.11017559
## 1.000 6.251398e-03 0.7797438 0.03725118
## 1.000 2.901642e-02 0.7743332 0.00000000
##
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were alpha = 0.55 and lambda
## = 0.0002901642.
## [1] "Interact.High.cor.Y##rcv#glmnet fit prediction diagnostics:"
## [1] "Interact.High.cor.Y##rcv#glmnet OOB prediction diagnostics:"
## Interact.High.cor.Y..rcv.glmnet.imp
## Datereceived.last16.log1p:Datereceived.year.fctr2016 1.000000e+02
## Datereceived.last16.log1p:Datereceived.month.fctr08 7.069522e+01
## Datereceived.last16.log1p:Datereceived.month.fctr09 5.416382e+01
## Datereceived.last16.log1p:Datereceived.month.fctr07 4.644842e+01
## Datereceived.last16.log1p:Datereceived.month.fctr06 3.383224e+01
## Datereceived.last16.log1p:Datereceived.year.fctr2015 3.129653e+01
## Datereceived.last8.log1p 2.918189e+01
## Datereceived.last16.log1p:Datereceived.year.fctr2014 2.885588e+01
## Datereceived.last16.log1p:Datereceived.month.fctr05 2.795287e+01
## Datereceived.last16.log1p:Datereceived.month.fctr12 2.700819e+01
## Datereceived.last16.log1p:Datereceived.wkend 2.242858e+01
## Datereceived.last16.log1p:Datereceived.wkday.fctr2 2.133078e+01
## Datereceived.last16.log1p:Datereceived.month.fctr02 1.720715e+01
## Datereceived.last16.log1p:Datereceived.date.fctr(25,31] 1.629172e+01
## Datereceived.last16.log1p:Datereceived.date.fctr(19,25] 1.006684e+01
## Datereceived.last16.log1p:Datereceived.month.fctr10 9.643381e+00
## Datereceived.last16.log1p:Datereceived.date.fctr(7,13] 7.828063e+00
## Datereceived.last16.log1p:Datereceived.wkday.fctr4 6.297452e+00
## Datereceived.last16.log1p:Datereceived.date.fctr(13,19] 6.130789e+00
## Datereceived.last16.log1p:Datereceived.month.fctr11 4.863029e+00
## Datereceived.last16.log1p:Datereceived.last32.log1p 4.376526e+00
## Datereceived.last16.log1p:Datereceived.wkday.fctr5 4.346394e+00
## Datereceived.last16.log1p:Datereceived.month.fctr03 3.711741e+00
## Datereceived.last16.log1p:Sent.Recd.Dys.log1p 3.244713e+00
## Datereceived.last16.log1p:Datereceived.wkday.fctr1 3.065406e+00
## Datereceived.last16.log1p 2.011325e+00
## Datereceived.last16.log1p:Datereceived.wkday.fctr6 1.455873e+00
## Datereceived.last16.log1p:Datereceived.last2.log1p 7.886884e-01
## Datereceived.last16.log1p:Datereceived.last4.log1p 5.938799e-01
## Datereceived.last16.log1p:Datereceived.year.fctr2013 5.068562e-01
## Datereceived.last16.log1p:Datereceived.juliandate 1.453330e-01
## Datereceived.last16.log1p:ComplaintID 3.378702e-05
## Datereceived.last16.log1p:Datereceived.last8.log1p 0.000000e+00
## Datereceived.last16.log1p:Datereceived.month.fctr04 0.000000e+00
## Datereceived.last16.log1p:Datereceived.wkday.fctr3 0.000000e+00
## imp
## Datereceived.last16.log1p:Datereceived.year.fctr2016 1.000000e+02
## Datereceived.last16.log1p:Datereceived.month.fctr08 7.069522e+01
## Datereceived.last16.log1p:Datereceived.month.fctr09 5.416382e+01
## Datereceived.last16.log1p:Datereceived.month.fctr07 4.644842e+01
## Datereceived.last16.log1p:Datereceived.month.fctr06 3.383224e+01
## Datereceived.last16.log1p:Datereceived.year.fctr2015 3.129653e+01
## Datereceived.last8.log1p 2.918189e+01
## Datereceived.last16.log1p:Datereceived.year.fctr2014 2.885588e+01
## Datereceived.last16.log1p:Datereceived.month.fctr05 2.795287e+01
## Datereceived.last16.log1p:Datereceived.month.fctr12 2.700819e+01
## Datereceived.last16.log1p:Datereceived.wkend 2.242858e+01
## Datereceived.last16.log1p:Datereceived.wkday.fctr2 2.133078e+01
## Datereceived.last16.log1p:Datereceived.month.fctr02 1.720715e+01
## Datereceived.last16.log1p:Datereceived.date.fctr(25,31] 1.629172e+01
## Datereceived.last16.log1p:Datereceived.date.fctr(19,25] 1.006684e+01
## Datereceived.last16.log1p:Datereceived.month.fctr10 9.643381e+00
## Datereceived.last16.log1p:Datereceived.date.fctr(7,13] 7.828063e+00
## Datereceived.last16.log1p:Datereceived.wkday.fctr4 6.297452e+00
## Datereceived.last16.log1p:Datereceived.date.fctr(13,19] 6.130789e+00
## Datereceived.last16.log1p:Datereceived.month.fctr11 4.863029e+00
## Datereceived.last16.log1p:Datereceived.last32.log1p 4.376526e+00
## Datereceived.last16.log1p:Datereceived.wkday.fctr5 4.346394e+00
## Datereceived.last16.log1p:Datereceived.month.fctr03 3.711741e+00
## Datereceived.last16.log1p:Sent.Recd.Dys.log1p 3.244713e+00
## Datereceived.last16.log1p:Datereceived.wkday.fctr1 3.065406e+00
## Datereceived.last16.log1p 2.011325e+00
## Datereceived.last16.log1p:Datereceived.wkday.fctr6 1.455873e+00
## Datereceived.last16.log1p:Datereceived.last2.log1p 7.886884e-01
## Datereceived.last16.log1p:Datereceived.last4.log1p 5.938799e-01
## Datereceived.last16.log1p:Datereceived.year.fctr2013 5.068562e-01
## Datereceived.last16.log1p:Datereceived.juliandate 1.453330e-01
## Datereceived.last16.log1p:ComplaintID 3.378702e-05
## Datereceived.last16.log1p:Datereceived.last8.log1p 0.000000e+00
## Datereceived.last16.log1p:Datereceived.month.fctr04 0.000000e+00
## Datereceived.last16.log1p:Datereceived.wkday.fctr3 0.000000e+00
## Warning in glb_analytics_diag_plots(obs_df = glbObsOOB, mdl_id =
## glbMdlSltId, : Limiting important feature scatter plots to 5 out of 14
## Loading required package: lazyeval
## [1] "Min/Max Boundaries: "
## ComplaintID CDisputed.fctr
## 1 1492647 Y
## 2 719174 Y
## 3 738765 Y
## 4 731206 Y
## 5 734695 Y
## 6 742427 Y
## 7 146923 Y
## 8 146026 Y
## 9 925067 N
## 10 1057694 N
## CDisputed.fctr.Interact.High.cor.Y..rcv.glmnet.prob
## 1 0.1899589
## 2 0.1899589
## 3 0.1899589
## 4 0.1899589
## 5 0.1899589
## 6 0.1899589
## 7 0.2813806
## 8 0.3215605
## 9 0.2775021
## 10 0.1899589
## CDisputed.fctr.Interact.High.cor.Y..rcv.glmnet
## 1 N
## 2 N
## 3 N
## 4 N
## 5 N
## 6 N
## 7 N
## 8 N
## 9 N
## 10 N
## CDisputed.fctr.Interact.High.cor.Y..rcv.glmnet.err
## 1 TRUE
## 2 TRUE
## 3 TRUE
## 4 TRUE
## 5 TRUE
## 6 TRUE
## 7 TRUE
## 8 TRUE
## 9 FALSE
## 10 FALSE
## CDisputed.fctr.Interact.High.cor.Y..rcv.glmnet.err.abs
## 1 0.8100411
## 2 0.8100411
## 3 0.8100411
## 4 0.8100411
## 5 0.8100411
## 6 0.8100411
## 7 0.7186194
## 8 0.6784395
## 9 0.2775021
## 10 0.1899589
## CDisputed.fctr.Interact.High.cor.Y..rcv.glmnet.is.acc
## 1 FALSE
## 2 FALSE
## 3 FALSE
## 4 FALSE
## 5 FALSE
## 6 FALSE
## 7 FALSE
## 8 FALSE
## 9 TRUE
## 10 TRUE
## CDisputed.fctr.Interact.High.cor.Y..rcv.glmnet.accurate
## 1 FALSE
## 2 FALSE
## 3 FALSE
## 4 FALSE
## 5 FALSE
## 6 FALSE
## 7 FALSE
## 8 FALSE
## 9 TRUE
## 10 TRUE
## CDisputed.fctr.Interact.High.cor.Y..rcv.glmnet.error .label
## 1 -0.3100411 1492647
## 2 -0.3100411 719174
## 3 -0.3100411 738765
## 4 -0.3100411 731206
## 5 -0.3100411 734695
## 6 -0.3100411 742427
## 7 -0.2186194 146923
## 8 -0.1784395 146026
## 9 0.0000000 925067
## 10 0.0000000 1057694
## [1] "Inaccurate: "
## ComplaintID CDisputed.fctr
## 1 701765 Y
## 2 1230459 Y
## 3 1179956 Y
## 4 1240855 Y
## 5 702748 Y
## 6 308655 Y
## CDisputed.fctr.Interact.High.cor.Y..rcv.glmnet.prob
## 1 0.1168213
## 2 0.1236901
## 3 0.1250713
## 4 0.1261765
## 5 0.1272616
## 6 0.1281071
## CDisputed.fctr.Interact.High.cor.Y..rcv.glmnet
## 1 N
## 2 N
## 3 N
## 4 N
## 5 N
## 6 N
## CDisputed.fctr.Interact.High.cor.Y..rcv.glmnet.err
## 1 TRUE
## 2 TRUE
## 3 TRUE
## 4 TRUE
## 5 TRUE
## 6 TRUE
## CDisputed.fctr.Interact.High.cor.Y..rcv.glmnet.err.abs
## 1 0.8831787
## 2 0.8763099
## 3 0.8749287
## 4 0.8738235
## 5 0.8727384
## 6 0.8718929
## CDisputed.fctr.Interact.High.cor.Y..rcv.glmnet.is.acc
## 1 FALSE
## 2 FALSE
## 3 FALSE
## 4 FALSE
## 5 FALSE
## 6 FALSE
## CDisputed.fctr.Interact.High.cor.Y..rcv.glmnet.accurate
## 1 FALSE
## 2 FALSE
## 3 FALSE
## 4 FALSE
## 5 FALSE
## 6 FALSE
## CDisputed.fctr.Interact.High.cor.Y..rcv.glmnet.error
## 1 -0.3831787
## 2 -0.3763099
## 3 -0.3749287
## 4 -0.3738235
## 5 -0.3727384
## 6 -0.3718929
## ComplaintID CDisputed.fctr
## 197 185851 Y
## 476 1413744 Y
## 1117 955822 Y
## 1304 1403490 Y
## 1504 1510933 Y
## 1606 508440 Y
## CDisputed.fctr.Interact.High.cor.Y..rcv.glmnet.prob
## 197 0.1822330
## 476 0.1899589
## 1117 0.2089868
## 1304 0.2415047
## 1504 0.2867303
## 1606 0.3391767
## CDisputed.fctr.Interact.High.cor.Y..rcv.glmnet
## 197 N
## 476 N
## 1117 N
## 1304 N
## 1504 N
## 1606 N
## CDisputed.fctr.Interact.High.cor.Y..rcv.glmnet.err
## 197 TRUE
## 476 TRUE
## 1117 TRUE
## 1304 TRUE
## 1504 TRUE
## 1606 TRUE
## CDisputed.fctr.Interact.High.cor.Y..rcv.glmnet.err.abs
## 197 0.8177670
## 476 0.8100411
## 1117 0.7910132
## 1304 0.7584953
## 1504 0.7132697
## 1606 0.6608233
## CDisputed.fctr.Interact.High.cor.Y..rcv.glmnet.is.acc
## 197 FALSE
## 476 FALSE
## 1117 FALSE
## 1304 FALSE
## 1504 FALSE
## 1606 FALSE
## CDisputed.fctr.Interact.High.cor.Y..rcv.glmnet.accurate
## 197 FALSE
## 476 FALSE
## 1117 FALSE
## 1304 FALSE
## 1504 FALSE
## 1606 FALSE
## CDisputed.fctr.Interact.High.cor.Y..rcv.glmnet.error
## 197 -0.3177670
## 476 -0.3100411
## 1117 -0.2910132
## 1304 -0.2584953
## 1504 -0.2132697
## 1606 -0.1608233
## ComplaintID CDisputed.fctr
## 1767 2038500 N
## 1768 1936442 N
## 1769 1955185 N
## 1770 1965699 N
## 1771 1958907 N
## 1772 1975566 N
## CDisputed.fctr.Interact.High.cor.Y..rcv.glmnet.prob
## 1767 0.5304951
## 1768 0.5342601
## 1769 0.5355377
## 1770 0.5463535
## 1771 0.5474939
## 1772 0.5529280
## CDisputed.fctr.Interact.High.cor.Y..rcv.glmnet
## 1767 Y
## 1768 Y
## 1769 Y
## 1770 Y
## 1771 Y
## 1772 Y
## CDisputed.fctr.Interact.High.cor.Y..rcv.glmnet.err
## 1767 TRUE
## 1768 TRUE
## 1769 TRUE
## 1770 TRUE
## 1771 TRUE
## 1772 TRUE
## CDisputed.fctr.Interact.High.cor.Y..rcv.glmnet.err.abs
## 1767 0.5304951
## 1768 0.5342601
## 1769 0.5355377
## 1770 0.5463535
## 1771 0.5474939
## 1772 0.5529280
## CDisputed.fctr.Interact.High.cor.Y..rcv.glmnet.is.acc
## 1767 FALSE
## 1768 FALSE
## 1769 FALSE
## 1770 FALSE
## 1771 FALSE
## 1772 FALSE
## CDisputed.fctr.Interact.High.cor.Y..rcv.glmnet.accurate
## 1767 FALSE
## 1768 FALSE
## 1769 FALSE
## 1770 FALSE
## 1771 FALSE
## 1772 FALSE
## CDisputed.fctr.Interact.High.cor.Y..rcv.glmnet.error
## 1767 0.03049512
## 1768 0.03426011
## 1769 0.03553769
## 1770 0.04635346
## 1771 0.04749386
## 1772 0.05292801
## .category .n.OOB .n.Fit .n.Tst .freqRatio.Fit .freqRatio.OOB
## .dummy .dummy 8548 34192 3229 1 1
## .freqRatio.Tst err.abs.fit.sum err.abs.fit.mean .n.fit
## .dummy 1 11473.35 0.3355567 34192
## err.abs.OOB.sum err.abs.OOB.mean
## .dummy 2868.779 0.3356082
## .n.OOB .n.Fit .n.Tst .freqRatio.Fit
## 8.548000e+03 3.419200e+04 3.229000e+03 1.000000e+00
## .freqRatio.OOB .freqRatio.Tst err.abs.fit.sum err.abs.fit.mean
## 1.000000e+00 1.000000e+00 1.147335e+04 3.355567e-01
## .n.fit err.abs.OOB.sum err.abs.OOB.mean
## 3.419200e+04 2.868779e+03 3.356082e-01
## label step_major step_minor label_minor bgn end elapsed
## 1 fit.models_2_bgn 1 0 teardown 634.994 NA NA
## label step_major step_minor label_minor bgn end elapsed
## 4 fit.models 2 2 2 619.227 635.005 15.778
## 5 fit.models 2 3 3 635.005 NA NA
# if (sum(is.na(glbObsAll$D.P.http)) > 0)
# stop("fit.models_3: Why is this happening ?")
#stop(here"); glb2Sav()
sync_glb_obs_df <- function() {
# Merge or cbind ?
for (col in setdiff(names(glbObsFit), names(glbObsTrn)))
glbObsTrn[glbObsTrn$.lcn == "Fit", col] <<- glbObsFit[, col]
for (col in setdiff(names(glbObsFit), names(glbObsAll)))
glbObsAll[glbObsAll$.lcn == "Fit", col] <<- glbObsFit[, col]
if (all(is.na(glbObsNew[, glb_rsp_var])))
for (col in setdiff(names(glbObsOOB), names(glbObsTrn)))
glbObsTrn[glbObsTrn$.lcn == "OOB", col] <<- glbObsOOB[, col]
for (col in setdiff(names(glbObsOOB), names(glbObsAll)))
glbObsAll[glbObsAll$.lcn == "OOB", col] <<- glbObsOOB[, col]
}
sync_glb_obs_df()
print(setdiff(names(glbObsNew), names(glbObsAll)))
## character(0)
replay.petrisim(pn = glb_analytics_pn,
replay.trans = (glb_analytics_avl_objs <- c(glb_analytics_avl_objs,
"model.selected")), flip_coord = TRUE)
## time trans "bgn " "fit.data.training.all " "predict.data.new " "end "
## 0.0000 multiple enabled transitions: data.training.all data.new model.selected firing: model.selected
## 1.0000 3 2 1 0 0
glb_chunks_df <- myadd_chunk(glb_chunks_df, "fit.data.training", major.inc = TRUE)
## label step_major step_minor label_minor bgn end
## 5 fit.models 2 3 3 635.005 640.973
## 6 fit.data.training 3 0 0 640.973 NA
## elapsed
## 5 5.968
## 6 NA
#myloadChunk(glbChunks$inpFilePathName, keepSpec = c(NULL), dropSpec = c(NULL)); glb2Sav()
3.0: fit data training## [1] "myfit_mdl: enter: 0.000000 secs"
## [1] "myfit_mdl: fitting model: Trn.Interact.High.cor.Y###glmnet"
## [1] " indepVar: Datereceived.last16.log1p,Datereceived.last8.log1p,Datereceived.last16.log1p:Datereceived.last16.log1p,Datereceived.last16.log1p:Datereceived.last8.log1p,Datereceived.last16.log1p:Datereceived.last4.log1p,Datereceived.last16.log1p:Datereceived.last32.log1p,Datereceived.last16.log1p:Datereceived.last2.log1p,Datereceived.last16.log1p:ComplaintID,Datereceived.last16.log1p:Datereceived.year.fctr,Datereceived.last16.log1p:Datereceived.juliandate,Datereceived.last16.log1p:Datereceived.month.fctr,Datereceived.last16.log1p:Datereceived.wkend,Datereceived.last16.log1p:Sent.Recd.Dys.log1p,Datereceived.last16.log1p:Datereceived.date.fctr,Datereceived.last16.log1p:Datereceived.wkday.fctr"
## [1] "myfit_mdl: setup complete: 0.726000 secs"
## Fitting alpha = 0.55, lambda = 0.00029 on full training set
## [1] "myfit_mdl: train complete: 3.652000 secs"
## alpha lambda
## 1 0.55 0.0002901642
## Length Class Mode
## a0 61 -none- numeric
## beta 2135 dgCMatrix S4
## df 61 -none- numeric
## dim 2 -none- numeric
## lambda 61 -none- numeric
## dev.ratio 61 -none- numeric
## nulldev 1 -none- numeric
## npasses 1 -none- numeric
## jerr 1 -none- numeric
## offset 1 -none- logical
## classnames 2 -none- character
## call 5 -none- call
## nobs 1 -none- numeric
## lambdaOpt 1 -none- numeric
## xNames 35 -none- character
## problemType 1 -none- character
## tuneValue 2 data.frame list
## obsLevels 2 -none- character
## [1] "min lambda > lambdaOpt:"
## (Intercept)
## -1.449132e+00
## Datereceived.last16.log1p
## -2.633712e-03
## Datereceived.last8.log1p
## 1.908868e-02
## Datereceived.last16.log1p:ComplaintID
## 1.729734e-08
## Datereceived.last16.log1p:Datereceived.date.fctr(7,13]
## 1.170171e-03
## Datereceived.last16.log1p:Datereceived.date.fctr(13,19]
## 1.014663e-03
## Datereceived.last16.log1p:Datereceived.date.fctr(19,25]
## 3.804900e-03
## Datereceived.last16.log1p:Datereceived.date.fctr(25,31]
## 6.368964e-03
## Datereceived.last16.log1p:Datereceived.juliandate
## 1.070163e-04
## Datereceived.last16.log1p:Datereceived.last2.log1p
## 3.750284e-04
## Datereceived.last16.log1p:Datereceived.last32.log1p
## -2.715102e-03
## Datereceived.last16.log1p:Datereceived.last4.log1p
## 3.403461e-04
## Datereceived.last16.log1p:Datereceived.month.fctr02
## -7.485879e-03
## Datereceived.last16.log1p:Datereceived.month.fctr05
## 1.656305e-02
## Datereceived.last16.log1p:Datereceived.month.fctr06
## 2.110048e-02
## Datereceived.last16.log1p:Datereceived.month.fctr07
## 2.774896e-02
## Datereceived.last16.log1p:Datereceived.month.fctr08
## 4.367237e-02
## Datereceived.last16.log1p:Datereceived.month.fctr09
## 3.438484e-02
## Datereceived.last16.log1p:Datereceived.month.fctr10
## 5.485943e-03
## Datereceived.last16.log1p:Datereceived.month.fctr11
## -6.950744e-03
## Datereceived.last16.log1p:Datereceived.month.fctr12
## -1.960298e-02
## Datereceived.last16.log1p:Datereceived.wkday.fctr2
## 1.251253e-02
## Datereceived.last16.log1p:Datereceived.wkday.fctr3
## 1.211954e-03
## Datereceived.last16.log1p:Datereceived.wkday.fctr4
## -2.794199e-03
## Datereceived.last16.log1p:Datereceived.wkday.fctr5
## -3.069316e-03
## Datereceived.last16.log1p:Datereceived.wkday.fctr6
## 3.562386e-03
## Datereceived.last16.log1p:Datereceived.wkend
## -1.539183e-02
## Datereceived.last16.log1p:Datereceived.year.fctr2014
## -1.542515e-02
## Datereceived.last16.log1p:Datereceived.year.fctr2015
## -1.498039e-02
## Datereceived.last16.log1p:Datereceived.year.fctr2016
## 6.787761e-02
## Datereceived.last16.log1p:Sent.Recd.Dys.log1p
## -4.985695e-04
## [1] "max lambda < lambdaOpt:"
## [1] "Feats mismatch between coefs_left & rght:"
## [1] "(Intercept)"
## [2] "Datereceived.last16.log1p"
## [3] "Datereceived.last8.log1p"
## [4] "Datereceived.last16.log1p:ComplaintID"
## [5] "Datereceived.last16.log1p:Datereceived.date.fctr(7,13]"
## [6] "Datereceived.last16.log1p:Datereceived.date.fctr(13,19]"
## [7] "Datereceived.last16.log1p:Datereceived.date.fctr(19,25]"
## [8] "Datereceived.last16.log1p:Datereceived.date.fctr(25,31]"
## [9] "Datereceived.last16.log1p:Datereceived.juliandate"
## [10] "Datereceived.last16.log1p:Datereceived.last2.log1p"
## [11] "Datereceived.last16.log1p:Datereceived.last32.log1p"
## [12] "Datereceived.last16.log1p:Datereceived.last4.log1p"
## [13] "Datereceived.last16.log1p:Datereceived.last8.log1p"
## [14] "Datereceived.last16.log1p:Datereceived.month.fctr02"
## [15] "Datereceived.last16.log1p:Datereceived.month.fctr03"
## [16] "Datereceived.last16.log1p:Datereceived.month.fctr04"
## [17] "Datereceived.last16.log1p:Datereceived.month.fctr05"
## [18] "Datereceived.last16.log1p:Datereceived.month.fctr06"
## [19] "Datereceived.last16.log1p:Datereceived.month.fctr07"
## [20] "Datereceived.last16.log1p:Datereceived.month.fctr08"
## [21] "Datereceived.last16.log1p:Datereceived.month.fctr09"
## [22] "Datereceived.last16.log1p:Datereceived.month.fctr10"
## [23] "Datereceived.last16.log1p:Datereceived.month.fctr11"
## [24] "Datereceived.last16.log1p:Datereceived.month.fctr12"
## [25] "Datereceived.last16.log1p:Datereceived.wkday.fctr1"
## [26] "Datereceived.last16.log1p:Datereceived.wkday.fctr2"
## [27] "Datereceived.last16.log1p:Datereceived.wkday.fctr3"
## [28] "Datereceived.last16.log1p:Datereceived.wkday.fctr4"
## [29] "Datereceived.last16.log1p:Datereceived.wkday.fctr5"
## [30] "Datereceived.last16.log1p:Datereceived.wkday.fctr6"
## [31] "Datereceived.last16.log1p:Datereceived.wkend"
## [32] "Datereceived.last16.log1p:Datereceived.year.fctr2013"
## [33] "Datereceived.last16.log1p:Datereceived.year.fctr2014"
## [34] "Datereceived.last16.log1p:Datereceived.year.fctr2015"
## [35] "Datereceived.last16.log1p:Datereceived.year.fctr2016"
## [36] "Datereceived.last16.log1p:Sent.Recd.Dys.log1p"
## [1] "myfit_mdl: train diagnostics complete: 3.676000 secs"
## Prediction
## Reference N Y
## N 33005 90
## Y 8773 872
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 7.926299e-01 1.287566e-01 7.887547e-01 7.964652e-01 7.743332e-01
## AccuracyPValue McnemarPValue
## 3.357775e-20 0.000000e+00
## [1] "myfit_mdl: predict complete: 1983.093000 secs"
## id
## 1 Trn.Interact.High.cor.Y###glmnet
## feats
## 1 Datereceived.last16.log1p,Datereceived.last8.log1p,Datereceived.last16.log1p:Datereceived.last16.log1p,Datereceived.last16.log1p:Datereceived.last8.log1p,Datereceived.last16.log1p:Datereceived.last4.log1p,Datereceived.last16.log1p:Datereceived.last32.log1p,Datereceived.last16.log1p:Datereceived.last2.log1p,Datereceived.last16.log1p:ComplaintID,Datereceived.last16.log1p:Datereceived.year.fctr,Datereceived.last16.log1p:Datereceived.juliandate,Datereceived.last16.log1p:Datereceived.month.fctr,Datereceived.last16.log1p:Datereceived.wkend,Datereceived.last16.log1p:Sent.Recd.Dys.log1p,Datereceived.last16.log1p:Datereceived.date.fctr,Datereceived.last16.log1p:Datereceived.wkday.fctr
## max.nTuningRuns min.elapsedtime.everything min.elapsedtime.final
## 1 0 2.846 1.936
## max.AUCpROC.fit max.Sens.fit max.Spec.fit max.AUCROCR.fit
## 1 0.543845 0.9972806 0.09040954 0.5742735
## opt.prob.threshold.fit max.f.score.fit max.Accuracy.fit
## 1 0.5 0.1644197 0.7926299
## max.AccuracyLower.fit max.AccuracyUpper.fit max.Kappa.fit
## 1 0.7887547 0.7964652 0.1287566
## min.log.loss.mean.fit
## 1 0.517101
## [1] "myfit_mdl: exit: 1983.107000 secs"
## [1] "myfit_mdl: enter: 0.000000 secs"
## [1] "myfit_mdl: fitting model: Trn.Interact.High.cor.Y##rcv#glmnet"
## [1] " indepVar: Datereceived.last16.log1p,Datereceived.last8.log1p,Datereceived.last16.log1p:Datereceived.last16.log1p,Datereceived.last16.log1p:Datereceived.last8.log1p,Datereceived.last16.log1p:Datereceived.last4.log1p,Datereceived.last16.log1p:Datereceived.last32.log1p,Datereceived.last16.log1p:Datereceived.last2.log1p,Datereceived.last16.log1p:ComplaintID,Datereceived.last16.log1p:Datereceived.year.fctr,Datereceived.last16.log1p:Datereceived.juliandate,Datereceived.last16.log1p:Datereceived.month.fctr,Datereceived.last16.log1p:Datereceived.wkend,Datereceived.last16.log1p:Sent.Recd.Dys.log1p,Datereceived.last16.log1p:Datereceived.date.fctr,Datereceived.last16.log1p:Datereceived.wkday.fctr"
## [1] "myfit_mdl: setup complete: 0.701000 secs"
## Aggregating results
## Selecting tuning parameters
## Fitting alpha = 0.55, lambda = 0.000288 on full training set
## [1] "myfit_mdl: train complete: 80.379000 secs"
## Length Class Mode
## a0 61 -none- numeric
## beta 2135 dgCMatrix S4
## df 61 -none- numeric
## dim 2 -none- numeric
## lambda 61 -none- numeric
## dev.ratio 61 -none- numeric
## nulldev 1 -none- numeric
## npasses 1 -none- numeric
## jerr 1 -none- numeric
## offset 1 -none- logical
## classnames 2 -none- character
## call 5 -none- call
## nobs 1 -none- numeric
## lambdaOpt 1 -none- numeric
## xNames 35 -none- character
## problemType 1 -none- character
## tuneValue 2 data.frame list
## obsLevels 2 -none- character
## [1] "min lambda > lambdaOpt:"
## (Intercept)
## -1.449132e+00
## Datereceived.last16.log1p
## -2.633712e-03
## Datereceived.last8.log1p
## 1.908868e-02
## Datereceived.last16.log1p:ComplaintID
## 1.729734e-08
## Datereceived.last16.log1p:Datereceived.date.fctr(7,13]
## 1.170171e-03
## Datereceived.last16.log1p:Datereceived.date.fctr(13,19]
## 1.014663e-03
## Datereceived.last16.log1p:Datereceived.date.fctr(19,25]
## 3.804900e-03
## Datereceived.last16.log1p:Datereceived.date.fctr(25,31]
## 6.368964e-03
## Datereceived.last16.log1p:Datereceived.juliandate
## 1.070163e-04
## Datereceived.last16.log1p:Datereceived.last2.log1p
## 3.750284e-04
## Datereceived.last16.log1p:Datereceived.last32.log1p
## -2.715102e-03
## Datereceived.last16.log1p:Datereceived.last4.log1p
## 3.403461e-04
## Datereceived.last16.log1p:Datereceived.month.fctr02
## -7.485879e-03
## Datereceived.last16.log1p:Datereceived.month.fctr05
## 1.656305e-02
## Datereceived.last16.log1p:Datereceived.month.fctr06
## 2.110048e-02
## Datereceived.last16.log1p:Datereceived.month.fctr07
## 2.774896e-02
## Datereceived.last16.log1p:Datereceived.month.fctr08
## 4.367237e-02
## Datereceived.last16.log1p:Datereceived.month.fctr09
## 3.438484e-02
## Datereceived.last16.log1p:Datereceived.month.fctr10
## 5.485943e-03
## Datereceived.last16.log1p:Datereceived.month.fctr11
## -6.950744e-03
## Datereceived.last16.log1p:Datereceived.month.fctr12
## -1.960298e-02
## Datereceived.last16.log1p:Datereceived.wkday.fctr2
## 1.251253e-02
## Datereceived.last16.log1p:Datereceived.wkday.fctr3
## 1.211954e-03
## Datereceived.last16.log1p:Datereceived.wkday.fctr4
## -2.794199e-03
## Datereceived.last16.log1p:Datereceived.wkday.fctr5
## -3.069316e-03
## Datereceived.last16.log1p:Datereceived.wkday.fctr6
## 3.562386e-03
## Datereceived.last16.log1p:Datereceived.wkend
## -1.539183e-02
## Datereceived.last16.log1p:Datereceived.year.fctr2014
## -1.542515e-02
## Datereceived.last16.log1p:Datereceived.year.fctr2015
## -1.498039e-02
## Datereceived.last16.log1p:Datereceived.year.fctr2016
## 6.787761e-02
## Datereceived.last16.log1p:Sent.Recd.Dys.log1p
## -4.985695e-04
## [1] "max lambda < lambdaOpt:"
## [1] "Feats mismatch between coefs_left & rght:"
## [1] "(Intercept)"
## [2] "Datereceived.last16.log1p"
## [3] "Datereceived.last8.log1p"
## [4] "Datereceived.last16.log1p:ComplaintID"
## [5] "Datereceived.last16.log1p:Datereceived.date.fctr(7,13]"
## [6] "Datereceived.last16.log1p:Datereceived.date.fctr(13,19]"
## [7] "Datereceived.last16.log1p:Datereceived.date.fctr(19,25]"
## [8] "Datereceived.last16.log1p:Datereceived.date.fctr(25,31]"
## [9] "Datereceived.last16.log1p:Datereceived.juliandate"
## [10] "Datereceived.last16.log1p:Datereceived.last2.log1p"
## [11] "Datereceived.last16.log1p:Datereceived.last32.log1p"
## [12] "Datereceived.last16.log1p:Datereceived.last4.log1p"
## [13] "Datereceived.last16.log1p:Datereceived.last8.log1p"
## [14] "Datereceived.last16.log1p:Datereceived.month.fctr02"
## [15] "Datereceived.last16.log1p:Datereceived.month.fctr03"
## [16] "Datereceived.last16.log1p:Datereceived.month.fctr04"
## [17] "Datereceived.last16.log1p:Datereceived.month.fctr05"
## [18] "Datereceived.last16.log1p:Datereceived.month.fctr06"
## [19] "Datereceived.last16.log1p:Datereceived.month.fctr07"
## [20] "Datereceived.last16.log1p:Datereceived.month.fctr08"
## [21] "Datereceived.last16.log1p:Datereceived.month.fctr09"
## [22] "Datereceived.last16.log1p:Datereceived.month.fctr10"
## [23] "Datereceived.last16.log1p:Datereceived.month.fctr11"
## [24] "Datereceived.last16.log1p:Datereceived.month.fctr12"
## [25] "Datereceived.last16.log1p:Datereceived.wkday.fctr1"
## [26] "Datereceived.last16.log1p:Datereceived.wkday.fctr2"
## [27] "Datereceived.last16.log1p:Datereceived.wkday.fctr3"
## [28] "Datereceived.last16.log1p:Datereceived.wkday.fctr4"
## [29] "Datereceived.last16.log1p:Datereceived.wkday.fctr5"
## [30] "Datereceived.last16.log1p:Datereceived.wkday.fctr6"
## [31] "Datereceived.last16.log1p:Datereceived.wkend"
## [32] "Datereceived.last16.log1p:Datereceived.year.fctr2013"
## [33] "Datereceived.last16.log1p:Datereceived.year.fctr2014"
## [34] "Datereceived.last16.log1p:Datereceived.year.fctr2015"
## [35] "Datereceived.last16.log1p:Datereceived.year.fctr2016"
## [36] "Datereceived.last16.log1p:Sent.Recd.Dys.log1p"
## [1] "myfit_mdl: train diagnostics complete: 80.949000 secs"
## Prediction
## Reference N Y
## N 33005 90
## Y 8773 872
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 7.926299e-01 1.287566e-01 7.887547e-01 7.964652e-01 7.743332e-01
## AccuracyPValue McnemarPValue
## 3.357775e-20 0.000000e+00
## [1] "myfit_mdl: predict complete: 101.269000 secs"
## id
## 1 Trn.Interact.High.cor.Y##rcv#glmnet
## feats
## 1 Datereceived.last16.log1p,Datereceived.last8.log1p,Datereceived.last16.log1p:Datereceived.last16.log1p,Datereceived.last16.log1p:Datereceived.last8.log1p,Datereceived.last16.log1p:Datereceived.last4.log1p,Datereceived.last16.log1p:Datereceived.last32.log1p,Datereceived.last16.log1p:Datereceived.last2.log1p,Datereceived.last16.log1p:ComplaintID,Datereceived.last16.log1p:Datereceived.year.fctr,Datereceived.last16.log1p:Datereceived.juliandate,Datereceived.last16.log1p:Datereceived.month.fctr,Datereceived.last16.log1p:Datereceived.wkend,Datereceived.last16.log1p:Sent.Recd.Dys.log1p,Datereceived.last16.log1p:Datereceived.date.fctr,Datereceived.last16.log1p:Datereceived.wkday.fctr
## max.nTuningRuns min.elapsedtime.everything min.elapsedtime.final
## 1 25 79.604 1.955
## max.AUCpROC.fit max.Sens.fit max.Spec.fit max.AUCROCR.fit
## 1 0.543845 0.9972806 0.09040954 0.5742735
## opt.prob.threshold.fit max.f.score.fit max.Accuracy.fit
## 1 0.5 0.1644197 0.7920995
## max.AccuracyLower.fit max.AccuracyUpper.fit max.Kappa.fit
## 1 0.7887547 0.7964652 0.12604
## min.log.loss.mean.fit max.AccuracySD.fit max.KappaSD.fit
## 1 0.517101 0.001222252 0.006979897
## [1] "myfit_mdl: exit: 101.284000 secs"
## label step_major step_minor label_minor bgn end
## 6 fit.data.training 3 0 0 640.973 2725.819
## 7 fit.data.training 3 1 1 2725.819 NA
## elapsed
## 6 2084.846
## 7 NA
#stop(here"); glb2Sav()
if (glb_is_classification && glb_is_binomial)
prob_threshold <- glb_models_df[glb_models_df$id == glbMdlSltId,
"opt.prob.threshold.OOB"] else
prob_threshold <- NULL
if (grepl("Ensemble", glbMdlFnlNslId)) {
# Get predictions for each model in ensemble; Outliers that have been moved to OOB might not have been predicted yet
mdlEnsembleComps <- unlist(str_split(subset(glb_models_df,
id == glbMdlFnlNslId)$feats, ","))
if (glb_is_classification)
# mdlEnsembleComps <- gsub("\\.prob$", "", mdlEnsembleComps)
# mdlEnsembleComps <- gsub(paste0("^",
# gsub(".", "\\.", mygetPredictIds(glb_rsp_var)$value, fixed = TRUE)),
# "", mdlEnsembleComps)
mdlEnsembleComps <- glb_models_df$id[sapply(glb_models_df$id, function(thsMdlId)
mygetPredictIds(glb_rsp_var, thsMdlId)$prob %in% mdlEnsembleComps)] else
mdlEnsembleComps <- glb_models_df$id[sapply(glb_models_df$id, function(thsMdlId)
mygetPredictIds(glb_rsp_var, thsMdlId)$value %in% mdlEnsembleComps)]
for (mdl_id in mdlEnsembleComps) {
glbObsTrn <- glb_get_predictions(df = glbObsTrn, mdl_id = mdl_id,
rsp_var = glb_rsp_var,
prob_threshold_def = prob_threshold)
glbObsNew <- glb_get_predictions(df = glbObsNew, mdl_id = mdl_id,
rsp_var = glb_rsp_var,
prob_threshold_def = prob_threshold)
# glbMdlFnlNsl uses the same coefficients as glbMdlSlt,
# so copy the "Final" columns into "non-Final" columns
glbObsTrn[, gsub("Trn.", "", unlist(mygetPredictIds(glb_rsp_var, mdl_id)))] <-
glbObsTrn[, unlist(mygetPredictIds(glb_rsp_var, mdl_id))]
glbObsNew[, gsub("Trn.", "", unlist(mygetPredictIds(glb_rsp_var, mdl_id)))] <-
glbObsNew[, unlist(mygetPredictIds(glb_rsp_var, mdl_id))]
}
}
glbObsTrn <- glb_get_predictions(df = glbObsTrn, mdl_id = glbMdlFnlNslId,
rsp_var = glb_rsp_var,
prob_threshold_def = prob_threshold)
## Warning in glb_get_predictions(df = glbObsTrn, mdl_id = glbMdlFnlNslId, :
## Using default probability threshold: 0.5
glb_featsimp_df <- myget_feats_importance(mdl=glbMdlFnlNsl,
featsimp_df=glb_featsimp_df)
#glb_featsimp_df[, paste0(glbMdlFnlNslId, ".imp")] <- glb_featsimp_df$imp
print(glb_featsimp_df)
## Interact.High.cor.Y..rcv.glmnet.imp
## Datereceived.last16.log1p:Datereceived.year.fctr2016 1.000000e+02
## Datereceived.last16.log1p:Datereceived.month.fctr08 7.069522e+01
## Datereceived.last16.log1p:Datereceived.month.fctr09 5.416382e+01
## Datereceived.last16.log1p:Datereceived.month.fctr07 4.644842e+01
## Datereceived.last16.log1p:Datereceived.month.fctr06 3.383224e+01
## Datereceived.last16.log1p:Datereceived.month.fctr12 2.700819e+01
## Datereceived.last8.log1p 2.918189e+01
## Datereceived.last16.log1p:Datereceived.month.fctr05 2.795287e+01
## Datereceived.last16.log1p:Datereceived.year.fctr2014 2.885588e+01
## Datereceived.last16.log1p:Datereceived.wkend 2.242858e+01
## Datereceived.last16.log1p:Datereceived.year.fctr2015 3.129653e+01
## Datereceived.last16.log1p:Datereceived.wkday.fctr2 2.133078e+01
## Datereceived.last16.log1p:Datereceived.month.fctr02 1.720715e+01
## Datereceived.last16.log1p:Datereceived.month.fctr11 4.863029e+00
## Datereceived.last16.log1p:Datereceived.date.fctr(25,31] 1.629172e+01
## Datereceived.last16.log1p:Datereceived.month.fctr10 9.643381e+00
## Datereceived.last16.log1p:Datereceived.date.fctr(19,25] 1.006684e+01
## Datereceived.last16.log1p:Datereceived.wkday.fctr6 1.455873e+00
## Datereceived.last16.log1p:Datereceived.wkday.fctr5 4.346394e+00
## Datereceived.last16.log1p:Datereceived.wkday.fctr4 6.297452e+00
## Datereceived.last16.log1p:Datereceived.last32.log1p 4.376526e+00
## Datereceived.last16.log1p 2.011325e+00
## Datereceived.last16.log1p:Datereceived.wkday.fctr3 0.000000e+00
## Datereceived.last16.log1p:Datereceived.date.fctr(7,13] 7.828063e+00
## Datereceived.last16.log1p:Datereceived.date.fctr(13,19] 6.130789e+00
## Datereceived.last16.log1p:Sent.Recd.Dys.log1p 3.244713e+00
## Datereceived.last16.log1p:Datereceived.last2.log1p 7.886884e-01
## Datereceived.last16.log1p:Datereceived.last4.log1p 5.938799e-01
## Datereceived.last16.log1p:Datereceived.juliandate 1.453330e-01
## Datereceived.last16.log1p:ComplaintID 3.378702e-05
## Datereceived.last16.log1p:Datereceived.last8.log1p 0.000000e+00
## Datereceived.last16.log1p:Datereceived.month.fctr03 3.711741e+00
## Datereceived.last16.log1p:Datereceived.month.fctr04 0.000000e+00
## Datereceived.last16.log1p:Datereceived.wkday.fctr1 3.065406e+00
## Datereceived.last16.log1p:Datereceived.year.fctr2013 5.068562e-01
## Trn.Interact.High.cor.Y...glmnet.imp
## Datereceived.last16.log1p:Datereceived.year.fctr2016 1.000000e+02
## Datereceived.last16.log1p:Datereceived.month.fctr08 6.433988e+01
## Datereceived.last16.log1p:Datereceived.month.fctr09 5.065711e+01
## Datereceived.last16.log1p:Datereceived.month.fctr07 4.088087e+01
## Datereceived.last16.log1p:Datereceived.month.fctr06 3.108606e+01
## Datereceived.last16.log1p:Datereceived.month.fctr12 2.887989e+01
## Datereceived.last8.log1p 2.812220e+01
## Datereceived.last16.log1p:Datereceived.month.fctr05 2.440135e+01
## Datereceived.last16.log1p:Datereceived.year.fctr2014 2.272494e+01
## Datereceived.last16.log1p:Datereceived.wkend 2.267585e+01
## Datereceived.last16.log1p:Datereceived.year.fctr2015 2.206971e+01
## Datereceived.last16.log1p:Datereceived.wkday.fctr2 1.843396e+01
## Datereceived.last16.log1p:Datereceived.month.fctr02 1.102849e+01
## Datereceived.last16.log1p:Datereceived.month.fctr11 1.024011e+01
## Datereceived.last16.log1p:Datereceived.date.fctr(25,31] 9.383011e+00
## Datereceived.last16.log1p:Datereceived.month.fctr10 8.082110e+00
## Datereceived.last16.log1p:Datereceived.date.fctr(19,25] 5.605530e+00
## Datereceived.last16.log1p:Datereceived.wkday.fctr6 5.248248e+00
## Datereceived.last16.log1p:Datereceived.wkday.fctr5 4.521839e+00
## Datereceived.last16.log1p:Datereceived.wkday.fctr4 4.116525e+00
## Datereceived.last16.log1p:Datereceived.last32.log1p 3.999996e+00
## Datereceived.last16.log1p 3.880089e+00
## Datereceived.last16.log1p:Datereceived.wkday.fctr3 1.785499e+00
## Datereceived.last16.log1p:Datereceived.date.fctr(7,13] 1.723943e+00
## Datereceived.last16.log1p:Datereceived.date.fctr(13,19] 1.494842e+00
## Datereceived.last16.log1p:Sent.Recd.Dys.log1p 7.345125e-01
## Datereceived.last16.log1p:Datereceived.last2.log1p 5.525067e-01
## Datereceived.last16.log1p:Datereceived.last4.log1p 5.014115e-01
## Datereceived.last16.log1p:Datereceived.juliandate 1.576607e-01
## Datereceived.last16.log1p:ComplaintID 2.548313e-05
## Datereceived.last16.log1p:Datereceived.last8.log1p 0.000000e+00
## Datereceived.last16.log1p:Datereceived.month.fctr03 0.000000e+00
## Datereceived.last16.log1p:Datereceived.month.fctr04 0.000000e+00
## Datereceived.last16.log1p:Datereceived.wkday.fctr1 0.000000e+00
## Datereceived.last16.log1p:Datereceived.year.fctr2013 0.000000e+00
## imp
## Datereceived.last16.log1p:Datereceived.year.fctr2016 1.000000e+02
## Datereceived.last16.log1p:Datereceived.month.fctr08 6.433988e+01
## Datereceived.last16.log1p:Datereceived.month.fctr09 5.065711e+01
## Datereceived.last16.log1p:Datereceived.month.fctr07 4.088087e+01
## Datereceived.last16.log1p:Datereceived.month.fctr06 3.108606e+01
## Datereceived.last16.log1p:Datereceived.month.fctr12 2.887989e+01
## Datereceived.last8.log1p 2.812220e+01
## Datereceived.last16.log1p:Datereceived.month.fctr05 2.440135e+01
## Datereceived.last16.log1p:Datereceived.year.fctr2014 2.272494e+01
## Datereceived.last16.log1p:Datereceived.wkend 2.267585e+01
## Datereceived.last16.log1p:Datereceived.year.fctr2015 2.206971e+01
## Datereceived.last16.log1p:Datereceived.wkday.fctr2 1.843396e+01
## Datereceived.last16.log1p:Datereceived.month.fctr02 1.102849e+01
## Datereceived.last16.log1p:Datereceived.month.fctr11 1.024011e+01
## Datereceived.last16.log1p:Datereceived.date.fctr(25,31] 9.383011e+00
## Datereceived.last16.log1p:Datereceived.month.fctr10 8.082110e+00
## Datereceived.last16.log1p:Datereceived.date.fctr(19,25] 5.605530e+00
## Datereceived.last16.log1p:Datereceived.wkday.fctr6 5.248248e+00
## Datereceived.last16.log1p:Datereceived.wkday.fctr5 4.521839e+00
## Datereceived.last16.log1p:Datereceived.wkday.fctr4 4.116525e+00
## Datereceived.last16.log1p:Datereceived.last32.log1p 3.999996e+00
## Datereceived.last16.log1p 3.880089e+00
## Datereceived.last16.log1p:Datereceived.wkday.fctr3 1.785499e+00
## Datereceived.last16.log1p:Datereceived.date.fctr(7,13] 1.723943e+00
## Datereceived.last16.log1p:Datereceived.date.fctr(13,19] 1.494842e+00
## Datereceived.last16.log1p:Sent.Recd.Dys.log1p 7.345125e-01
## Datereceived.last16.log1p:Datereceived.last2.log1p 5.525067e-01
## Datereceived.last16.log1p:Datereceived.last4.log1p 5.014115e-01
## Datereceived.last16.log1p:Datereceived.juliandate 1.576607e-01
## Datereceived.last16.log1p:ComplaintID 2.548313e-05
## Datereceived.last16.log1p:Datereceived.last8.log1p 0.000000e+00
## Datereceived.last16.log1p:Datereceived.month.fctr03 0.000000e+00
## Datereceived.last16.log1p:Datereceived.month.fctr04 0.000000e+00
## Datereceived.last16.log1p:Datereceived.wkday.fctr1 0.000000e+00
## Datereceived.last16.log1p:Datereceived.year.fctr2013 0.000000e+00
if (glb_is_classification && glb_is_binomial)
glb_analytics_diag_plots(obs_df=glbObsTrn, mdl_id=glbMdlFnlNslId,
prob_threshold=glb_models_df[glb_models_df$id == glbMdlSltId,
"opt.prob.threshold.OOB"]) else
glb_analytics_diag_plots(obs_df=glbObsTrn, mdl_id=glbMdlFnlNslId)
## Warning in glb_analytics_diag_plots(obs_df = glbObsTrn, mdl_id =
## glbMdlFnlNslId, : Limiting important feature scatter plots to 5 out of 14
## [1] "Min/Max Boundaries: "
## ComplaintID CDisputed.fctr
## 1 1307759 Y
## 2 1463353 Y
## 3 1476619 Y
## 4 1468969 Y
## 5 1484146 Y
## 6 1469003 Y
## 7 810647 N
## 8 146947 N
## CDisputed.fctr.Interact.High.cor.Y..rcv.glmnet.prob
## 1 0.1899589
## 2 0.1899589
## 3 0.1899589
## 4 0.1899589
## 5 0.1899589
## 6 0.1899589
## 7 0.1899589
## 8 0.2882603
## CDisputed.fctr.Interact.High.cor.Y..rcv.glmnet
## 1 N
## 2 N
## 3 N
## 4 N
## 5 N
## 6 N
## 7 N
## 8 N
## CDisputed.fctr.Interact.High.cor.Y..rcv.glmnet.err
## 1 TRUE
## 2 TRUE
## 3 TRUE
## 4 TRUE
## 5 TRUE
## 6 TRUE
## 7 FALSE
## 8 FALSE
## CDisputed.fctr.Interact.High.cor.Y..rcv.glmnet.err.abs
## 1 0.8100411
## 2 0.8100411
## 3 0.8100411
## 4 0.8100411
## 5 0.8100411
## 6 0.8100411
## 7 0.1899589
## 8 0.2882603
## CDisputed.fctr.Interact.High.cor.Y..rcv.glmnet.is.acc
## 1 FALSE
## 2 FALSE
## 3 FALSE
## 4 FALSE
## 5 FALSE
## 6 FALSE
## 7 TRUE
## 8 TRUE
## CDisputed.fctr.Trn.Interact.High.cor.Y...glmnet.prob
## 1 0.1901352
## 2 0.1901352
## 3 0.1901352
## 4 0.1901352
## 5 0.1901352
## 6 0.1901352
## 7 0.1901352
## 8 0.3096666
## CDisputed.fctr.Trn.Interact.High.cor.Y...glmnet
## 1 N
## 2 N
## 3 N
## 4 N
## 5 N
## 6 N
## 7 N
## 8 N
## CDisputed.fctr.Trn.Interact.High.cor.Y...glmnet.err
## 1 TRUE
## 2 TRUE
## 3 TRUE
## 4 TRUE
## 5 TRUE
## 6 TRUE
## 7 FALSE
## 8 FALSE
## CDisputed.fctr.Trn.Interact.High.cor.Y...glmnet.err.abs
## 1 0.8098648
## 2 0.8098648
## 3 0.8098648
## 4 0.8098648
## 5 0.8098648
## 6 0.8098648
## 7 0.1901352
## 8 0.3096666
## CDisputed.fctr.Trn.Interact.High.cor.Y...glmnet.is.acc
## 1 FALSE
## 2 FALSE
## 3 FALSE
## 4 FALSE
## 5 FALSE
## 6 FALSE
## 7 TRUE
## 8 TRUE
## CDisputed.fctr.Trn.Interact.High.cor.Y...glmnet.accurate
## 1 FALSE
## 2 FALSE
## 3 FALSE
## 4 FALSE
## 5 FALSE
## 6 FALSE
## 7 TRUE
## 8 TRUE
## CDisputed.fctr.Trn.Interact.High.cor.Y...glmnet.error .label
## 1 -0.3098648 1307759
## 2 -0.3098648 1463353
## 3 -0.3098648 1476619
## 4 -0.3098648 1468969
## 5 -0.3098648 1484146
## 6 -0.3098648 1469003
## 7 0.0000000 810647
## 8 0.0000000 146947
## [1] "Inaccurate: "
## ComplaintID CDisputed.fctr
## 1 727006 Y
## 2 727039 Y
## 3 716709 Y
## 4 739040 Y
## 5 676047 Y
## 6 1241291 Y
## CDisputed.fctr.Interact.High.cor.Y..rcv.glmnet.prob
## 1 0.1126924
## 2 0.1155862
## 3 0.1108017
## 4 0.1229988
## 5 0.1205261
## 6 0.1189478
## CDisputed.fctr.Interact.High.cor.Y..rcv.glmnet
## 1 N
## 2 N
## 3 N
## 4 N
## 5 N
## 6 N
## CDisputed.fctr.Interact.High.cor.Y..rcv.glmnet.err
## 1 TRUE
## 2 TRUE
## 3 TRUE
## 4 TRUE
## 5 TRUE
## 6 TRUE
## CDisputed.fctr.Interact.High.cor.Y..rcv.glmnet.err.abs
## 1 0.8873076
## 2 0.8844138
## 3 0.8891983
## 4 0.8770012
## 5 0.8794739
## 6 0.8810522
## CDisputed.fctr.Interact.High.cor.Y..rcv.glmnet.is.acc
## 1 FALSE
## 2 FALSE
## 3 FALSE
## 4 FALSE
## 5 FALSE
## 6 FALSE
## CDisputed.fctr.Trn.Interact.High.cor.Y...glmnet.prob
## 1 0.1144784
## 2 0.1152008
## 3 0.1160446
## 4 0.1202678
## 5 0.1208567
## 6 0.1209901
## CDisputed.fctr.Trn.Interact.High.cor.Y...glmnet
## 1 N
## 2 N
## 3 N
## 4 N
## 5 N
## 6 N
## CDisputed.fctr.Trn.Interact.High.cor.Y...glmnet.err
## 1 TRUE
## 2 TRUE
## 3 TRUE
## 4 TRUE
## 5 TRUE
## 6 TRUE
## CDisputed.fctr.Trn.Interact.High.cor.Y...glmnet.err.abs
## 1 0.8855216
## 2 0.8847992
## 3 0.8839554
## 4 0.8797322
## 5 0.8791433
## 6 0.8790099
## CDisputed.fctr.Trn.Interact.High.cor.Y...glmnet.is.acc
## 1 FALSE
## 2 FALSE
## 3 FALSE
## 4 FALSE
## 5 FALSE
## 6 FALSE
## CDisputed.fctr.Trn.Interact.High.cor.Y...glmnet.accurate
## 1 FALSE
## 2 FALSE
## 3 FALSE
## 4 FALSE
## 5 FALSE
## 6 FALSE
## CDisputed.fctr.Trn.Interact.High.cor.Y...glmnet.error
## 1 -0.3855216
## 2 -0.3847992
## 3 -0.3839554
## 4 -0.3797322
## 5 -0.3791433
## 6 -0.3790099
## ComplaintID CDisputed.fctr
## 338 784204 Y
## 3236 779183 Y
## 3856 1423708 Y
## 6561 627725 Y
## 7314 150708 Y
## 7758 1507743 Y
## CDisputed.fctr.Interact.High.cor.Y..rcv.glmnet.prob
## 338 0.1614008
## 3236 0.1899589
## 3856 0.1899589
## 6561 0.2513175
## 7314 NA
## 7758 0.3018713
## CDisputed.fctr.Interact.High.cor.Y..rcv.glmnet
## 338 N
## 3236 N
## 3856 N
## 6561 N
## 7314 <NA>
## 7758 N
## CDisputed.fctr.Interact.High.cor.Y..rcv.glmnet.err
## 338 TRUE
## 3236 TRUE
## 3856 TRUE
## 6561 TRUE
## 7314 NA
## 7758 TRUE
## CDisputed.fctr.Interact.High.cor.Y..rcv.glmnet.err.abs
## 338 0.8385992
## 3236 0.8100411
## 3856 0.8100411
## 6561 0.7486825
## 7314 NA
## 7758 0.6981287
## CDisputed.fctr.Interact.High.cor.Y..rcv.glmnet.is.acc
## 338 FALSE
## 3236 FALSE
## 3856 FALSE
## 6561 FALSE
## 7314 NA
## 7758 FALSE
## CDisputed.fctr.Trn.Interact.High.cor.Y...glmnet.prob
## 338 0.1529552
## 3236 0.1901352
## 3856 0.1901352
## 6561 0.2428487
## 7314 0.2789046
## 7758 0.3157572
## CDisputed.fctr.Trn.Interact.High.cor.Y...glmnet
## 338 N
## 3236 N
## 3856 N
## 6561 N
## 7314 N
## 7758 N
## CDisputed.fctr.Trn.Interact.High.cor.Y...glmnet.err
## 338 TRUE
## 3236 TRUE
## 3856 TRUE
## 6561 TRUE
## 7314 TRUE
## 7758 TRUE
## CDisputed.fctr.Trn.Interact.High.cor.Y...glmnet.err.abs
## 338 0.8470448
## 3236 0.8098648
## 3856 0.8098648
## 6561 0.7571513
## 7314 0.7210954
## 7758 0.6842428
## CDisputed.fctr.Trn.Interact.High.cor.Y...glmnet.is.acc
## 338 FALSE
## 3236 FALSE
## 3856 FALSE
## 6561 FALSE
## 7314 FALSE
## 7758 FALSE
## CDisputed.fctr.Trn.Interact.High.cor.Y...glmnet.accurate
## 338 FALSE
## 3236 FALSE
## 3856 FALSE
## 6561 FALSE
## 7314 FALSE
## 7758 FALSE
## CDisputed.fctr.Trn.Interact.High.cor.Y...glmnet.error
## 338 -0.3470448
## 3236 -0.3098648
## 3856 -0.3098648
## 6561 -0.2571513
## 7314 -0.2210954
## 7758 -0.1842428
## ComplaintID CDisputed.fctr
## 8858 1965699 N
## 8859 1975566 N
## 8860 1958907 N
## 8861 2053920 N
## 8862 2052715 N
## 8863 2053171 N
## CDisputed.fctr.Interact.High.cor.Y..rcv.glmnet.prob
## 8858 NA
## 8859 NA
## 8860 NA
## 8861 0.5686014
## 8862 0.5724104
## 8863 0.5724369
## CDisputed.fctr.Interact.High.cor.Y..rcv.glmnet
## 8858 <NA>
## 8859 <NA>
## 8860 <NA>
## 8861 Y
## 8862 Y
## 8863 Y
## CDisputed.fctr.Interact.High.cor.Y..rcv.glmnet.err
## 8858 NA
## 8859 NA
## 8860 NA
## 8861 TRUE
## 8862 TRUE
## 8863 TRUE
## CDisputed.fctr.Interact.High.cor.Y..rcv.glmnet.err.abs
## 8858 NA
## 8859 NA
## 8860 NA
## 8861 0.5686014
## 8862 0.5724104
## 8863 0.5724369
## CDisputed.fctr.Interact.High.cor.Y..rcv.glmnet.is.acc
## 8858 NA
## 8859 NA
## 8860 NA
## 8861 FALSE
## 8862 FALSE
## 8863 FALSE
## CDisputed.fctr.Trn.Interact.High.cor.Y...glmnet.prob
## 8858 0.5444872
## 8859 0.5458639
## 8860 0.5507895
## 8861 0.5645484
## 8862 0.5654556
## 8863 0.5654776
## CDisputed.fctr.Trn.Interact.High.cor.Y...glmnet
## 8858 Y
## 8859 Y
## 8860 Y
## 8861 Y
## 8862 Y
## 8863 Y
## CDisputed.fctr.Trn.Interact.High.cor.Y...glmnet.err
## 8858 TRUE
## 8859 TRUE
## 8860 TRUE
## 8861 TRUE
## 8862 TRUE
## 8863 TRUE
## CDisputed.fctr.Trn.Interact.High.cor.Y...glmnet.err.abs
## 8858 0.5444872
## 8859 0.5458639
## 8860 0.5507895
## 8861 0.5645484
## 8862 0.5654556
## 8863 0.5654776
## CDisputed.fctr.Trn.Interact.High.cor.Y...glmnet.is.acc
## 8858 FALSE
## 8859 FALSE
## 8860 FALSE
## 8861 FALSE
## 8862 FALSE
## 8863 FALSE
## CDisputed.fctr.Trn.Interact.High.cor.Y...glmnet.accurate
## 8858 FALSE
## 8859 FALSE
## 8860 FALSE
## 8861 FALSE
## 8862 FALSE
## 8863 FALSE
## CDisputed.fctr.Trn.Interact.High.cor.Y...glmnet.error
## 8858 0.04448717
## 8859 0.04586392
## 8860 0.05078952
## 8861 0.06454839
## 8862 0.06545560
## 8863 0.06547763
dsp_feats_vctr <- c(NULL)
for(var in grep(".imp", names(glb_feats_df), fixed=TRUE, value=TRUE))
dsp_feats_vctr <- union(dsp_feats_vctr,
glb_feats_df[!is.na(glb_feats_df[, var]), "id"])
# print(glbObsTrn[glbObsTrn$UniqueID %in% FN_OOB_ids,
# grep(glb_rsp_var, names(glbObsTrn), value=TRUE)])
print(setdiff(names(glbObsTrn), names(glbObsAll)))
## [1] "CDisputed.fctr.Trn.Interact.High.cor.Y...glmnet.prob"
## [2] "CDisputed.fctr.Trn.Interact.High.cor.Y...glmnet"
## [3] "CDisputed.fctr.Trn.Interact.High.cor.Y...glmnet.err"
## [4] "CDisputed.fctr.Trn.Interact.High.cor.Y...glmnet.err.abs"
## [5] "CDisputed.fctr.Trn.Interact.High.cor.Y...glmnet.is.acc"
for (col in setdiff(names(glbObsTrn), names(glbObsAll)))
# Merge or cbind ?
glbObsAll[glbObsAll$.src == "Train", col] <- glbObsTrn[, col]
print(setdiff(names(glbObsFit), names(glbObsAll)))
## character(0)
print(setdiff(names(glbObsOOB), names(glbObsAll)))
## character(0)
for (col in setdiff(names(glbObsOOB), names(glbObsAll)))
# Merge or cbind ?
glbObsAll[glbObsAll$.lcn == "OOB", col] <- glbObsOOB[, col]
print(setdiff(names(glbObsNew), names(glbObsAll)))
## character(0)
#glb2Sav(); all.equal(savObsAll, glbObsAll); all.equal(sav_models_lst, glb_models_lst)
#load(file = paste0(glbOut$pfx, "dsk_knitr.RData"))
#cmpCols <- names(glbObsAll)[!grepl("\\.Final\\.", names(glbObsAll))]; all.equal(savObsAll[, cmpCols], glbObsAll[, cmpCols]); all.equal(savObsAll[, "H.P.http"], glbObsAll[, "H.P.http"]);
replay.petrisim(pn = glb_analytics_pn,
replay.trans = (glb_analytics_avl_objs <- c(glb_analytics_avl_objs,
"data.training.all.prediction","model.final")), flip_coord = TRUE)
## time trans "bgn " "fit.data.training.all " "predict.data.new " "end "
## 0.0000 multiple enabled transitions: data.training.all data.new model.selected firing: model.selected
## 1.0000 3 2 1 0 0
## 1.0000 multiple enabled transitions: data.training.all data.new model.selected model.final data.training.all.prediction firing: data.training.all.prediction
## 2.0000 5 2 0 0 1
## Warning in replay.petrisim(pn = glb_analytics_pn, replay.trans =
## (glb_analytics_avl_objs <- c(glb_analytics_avl_objs, : Transition:
## model.final not enabled; adding missing token(s)
## Warning in replay.petrisim(pn = glb_analytics_pn, replay.trans
## = (glb_analytics_avl_objs <- c(glb_analytics_avl_objs, : Place:
## fit.data.training.all: added 1 missing token
## 2.0000 multiple enabled transitions: data.training.all data.new model.selected model.final data.training.all.prediction firing: model.final
## 3.0000 4 2 0 1 1
glb_chunks_df <- myadd_chunk(glb_chunks_df, "predict.data.new", major.inc = TRUE)
## label step_major step_minor label_minor bgn end
## 7 fit.data.training 3 1 1 2725.819 2753.943
## 8 predict.data.new 4 0 0 2753.944 NA
## elapsed
## 7 28.124
## 8 NA
4.0: predict data new## Warning in glb_get_predictions(obs_df, mdl_id = glbMdlFnlNslId, rsp_var =
## glb_rsp_var, : Using default probability threshold: 0.5
## Warning in glb_get_predictions(obs_df, mdl_id = glbMdlFnlRslId, rsp_var =
## glb_rsp_var, : Using default probability threshold: 0.5
## Warning in glb_get_predictions(obs_df, mdl_id = glbMdlFnlNslId, rsp_var =
## glb_rsp_var, : Using default probability threshold: 0.5
## Warning in glb_get_predictions(obs_df, mdl_id = glbMdlFnlRslId, rsp_var =
## glb_rsp_var, : Using default probability threshold: 0.5
## Warning in glb_analytics_diag_plots(obs_df = glbObsNew, mdl_id =
## glbMdlFnlNslId, : Limiting important feature scatter plots to 5 out of 14
## Warning: Removed 3229 rows containing missing values (geom_point).
## Warning: Removed 3229 rows containing missing values (geom_point).
## Warning: Removed 3229 rows containing missing values (geom_point).
## Warning: Removed 3229 rows containing missing values (geom_point).
## Warning: Removed 3229 rows containing missing values (geom_point).
## Warning: Removed 3229 rows containing missing values (geom_point).
## Warning: Removed 3229 rows containing missing values (geom_point).
## Warning: Removed 3229 rows containing missing values (geom_point).
## Warning: Removed 3229 rows containing missing values (geom_point).
## Warning: Removed 3229 rows containing missing values (geom_point).
## NULL
## Loading required package: tidyr
##
## Attaching package: 'tidyr'
## The following object is masked from 'package:Matrix':
##
## expand
## The following object is masked from 'package:reshape2':
##
## smiths
## [1] "OOBobs CDisputed.fctr.Interact.High.cor.Y..rcv.glmnet N: max > max of Train range: 2"
## ComplaintID CDisputed.fctr.Interact.High.cor.Y..rcv.glmnet
## 124545 145756 N
## 606385 2094668 N
## ComplaintID.1 Datereceived.last2.log1p
## 124545 145756 12.46536
## 606385 2094668 0.00000
## Datesenttocompany.last2.log1p
## 124545 12.46536
## 606385 0.00000
## id cor.y
## ComplaintID ComplaintID 0.04156423
## Datereceived.last2.log1p Datereceived.last2.log1p 0.04308620
## Datesenttocompany.last2.log1p Datesenttocompany.last2.log1p 0.04308620
## exclude.as.feat cor.y.abs
## ComplaintID FALSE 0.04156423
## Datereceived.last2.log1p FALSE 0.04308620
## Datesenttocompany.last2.log1p FALSE 0.04308620
## cor.high.X freqRatio
## ComplaintID <NA> 1.00000
## Datereceived.last2.log1p <NA> 11.92918
## Datesenttocompany.last2.log1p Datereceived.last2.log1p 11.92918
## percentUnique zeroVar nzv is.cor.y.abs.low
## ComplaintID 1.000000e+02 FALSE FALSE FALSE
## Datereceived.last2.log1p 9.358914e-03 FALSE FALSE FALSE
## Datesenttocompany.last2.log1p 9.358914e-03 FALSE FALSE FALSE
## interaction.feat shapiro.test.p.value
## ComplaintID NA 4.528050e-40
## Datereceived.last2.log1p NA 1.066566e-87
## Datesenttocompany.last2.log1p NA 9.142582e-88
## rsp_var_raw id_var rsp_var max min
## ComplaintID FALSE TRUE NA 2.155390e+06 82
## Datereceived.last2.log1p FALSE NA NA 1.246536e+01 0
## Datesenttocompany.last2.log1p FALSE NA NA 1.246536e+01 0
## max.CDisputed.fctr.N max.CDisputed.fctr.Y
## ComplaintID 2.05392e+06 2146384.0000
## Datereceived.last2.log1p 1.20599e+01 12.0599
## Datesenttocompany.last2.log1p 1.20599e+01 12.0599
## min.CDisputed.fctr.N min.CDisputed.fctr.Y
## ComplaintID 82 115
## Datereceived.last2.log1p 0 0
## Datesenttocompany.last2.log1p 0 0
## max.CDisputed.fctr.Interact.High.cor.Y..rcv.glmnet.N
## ComplaintID 2.094668e+06
## Datereceived.last2.log1p 1.246536e+01
## Datesenttocompany.last2.log1p 1.246536e+01
## max.CDisputed.fctr.Interact.High.cor.Y..rcv.glmnet.Y
## ComplaintID 2.143124e+06
## Datereceived.last2.log1p 1.136675e+01
## Datesenttocompany.last2.log1p 1.136675e+01
## min.CDisputed.fctr.Interact.High.cor.Y..rcv.glmnet.N
## ComplaintID 359
## Datereceived.last2.log1p 0
## Datesenttocompany.last2.log1p 0
## min.CDisputed.fctr.Interact.High.cor.Y..rcv.glmnet.Y
## ComplaintID 1852271
## Datereceived.last2.log1p 0
## Datesenttocompany.last2.log1p 0
## max.CDisputed.fctr.Trn.Interact.High.cor.Y...glmnet.N
## ComplaintID 2.152348e+06
## Datereceived.last2.log1p 1.136675e+01
## Datesenttocompany.last2.log1p 1.136675e+01
## max.CDisputed.fctr.Trn.Interact.High.cor.Y...glmnet.Y
## ComplaintID 2.15539e+06
## Datereceived.last2.log1p 1.20599e+01
## Datesenttocompany.last2.log1p 1.20599e+01
## min.CDisputed.fctr.Trn.Interact.High.cor.Y...glmnet.N
## ComplaintID 29578
## Datereceived.last2.log1p 0
## Datesenttocompany.last2.log1p 0
## min.CDisputed.fctr.Trn.Interact.High.cor.Y...glmnet.Y
## ComplaintID 2046184
## Datereceived.last2.log1p 0
## Datesenttocompany.last2.log1p 0
## [1] "OOBobs total range outliers: 2"
## [1] "newobs CDisputed.fctr.Trn.Interact.High.cor.Y...glmnet N: max > max of Train range: 2855"
## ComplaintID CDisputed.fctr.Trn.Interact.High.cor.Y...glmnet .pos
## 607463 557120 N 607463
## 607486 628095 N 607486
## 607520 616592 N 607520
## 607773 782013 N 607773
## 607884 810845 N 607884
## 608152 928538 N 608152
## ComplaintID.1
## 607463 557120
## 607486 628095
## 607520 616592
## 607773 782013
## 607884 810845
## 608152 928538
## ComplaintID CDisputed.fctr.Trn.Interact.High.cor.Y...glmnet .pos
## 610433 1640934 N 610433
## 623584 2076061 N 623584
## 629718 2024230 N 629718
## 630528 2097348 N 630528
## 632035 2098716 N 632035
## 636720 2125253 N 636720
## ComplaintID.1
## 610433 1640934
## 623584 2076061
## 629718 2024230
## 630528 2097348
## 632035 2098716
## 636720 2125253
## ComplaintID CDisputed.fctr.Trn.Interact.High.cor.Y...glmnet .pos
## 646843 2138485 N 646843
## 646851 2141418 N 646851
## 646874 2149415 N 646874
## 646877 2144130 N 646877
## 646907 2013071 N 646907
## 646909 2128348 N 646909
## ComplaintID.1
## 646843 2138485
## 646851 2141418
## 646874 2149415
## 646877 2144130
## 646907 2013071
## 646909 2128348
## id cor.y exclude.as.feat cor.y.abs cor.high.X
## .pos .pos 0.01952133 FALSE 0.01952133 <NA>
## ComplaintID ComplaintID 0.04156423 FALSE 0.04156423 <NA>
## freqRatio percentUnique zeroVar nzv is.cor.y.abs.low
## .pos 1 100 FALSE FALSE FALSE
## ComplaintID 1 100 FALSE FALSE FALSE
## interaction.feat shapiro.test.p.value rsp_var_raw id_var
## .pos NA 2.994698e-39 FALSE NA
## ComplaintID NA 4.528050e-40 FALSE TRUE
## rsp_var max min max.CDisputed.fctr.N max.CDisputed.fctr.Y
## .pos NA 646909 1 607197 607394
## ComplaintID NA 2155390 82 2053920 2146384
## min.CDisputed.fctr.N min.CDisputed.fctr.Y
## .pos 1 34
## ComplaintID 82 115
## max.CDisputed.fctr.Interact.High.cor.Y..rcv.glmnet.N
## .pos 607147
## ComplaintID 2094668
## max.CDisputed.fctr.Interact.High.cor.Y..rcv.glmnet.Y
## .pos 607392
## ComplaintID 2143124
## min.CDisputed.fctr.Interact.High.cor.Y..rcv.glmnet.N
## .pos 29
## ComplaintID 359
## min.CDisputed.fctr.Interact.High.cor.Y..rcv.glmnet.Y
## .pos 101946
## ComplaintID 1852271
## max.CDisputed.fctr.Trn.Interact.High.cor.Y...glmnet.N
## .pos 646909
## ComplaintID 2152348
## max.CDisputed.fctr.Trn.Interact.High.cor.Y...glmnet.Y
## .pos 646615
## ComplaintID 2155390
## min.CDisputed.fctr.Trn.Interact.High.cor.Y...glmnet.N
## .pos 607463
## ComplaintID 29578
## min.CDisputed.fctr.Trn.Interact.High.cor.Y...glmnet.Y
## .pos 614688
## ComplaintID 2046184
## [1] "newobs CDisputed.fctr.Trn.Interact.High.cor.Y...glmnet Y: max > max of Train range: 374"
## ComplaintID CDisputed.fctr.Trn.Interact.High.cor.Y...glmnet .pos
## 614688 2062368 Y 614688
## 614741 2074179 Y 614741
## 614763 2046184 Y 614763
## 614787 2052071 Y 614787
## 614878 2052764 Y 614878
## 614919 2069041 Y 614919
## ComplaintID.1 Datereceived.last16.log1p Datereceived.last32.log1p
## 614688 2062368 11.36675 11.36675
## 614741 2074179 11.36675 11.36675
## 614763 2046184 11.36675 11.36675
## 614787 2052071 11.36675 12.05990
## 614878 2052764 11.36675 11.36675
## 614919 2069041 11.36675 11.36675
## Datereceived.last8.log1p Datesenttocompany.last16.log1p
## 614688 11.36675 11.36675
## 614741 0.00000 11.36675
## 614763 0.00000 11.36675
## 614787 0.00000 11.36675
## 614878 0.00000 11.36675
## 614919 0.00000 11.36675
## Datesenttocompany.last32.log1p Datesenttocompany.last8.log1p
## 614688 11.36675 11.36675
## 614741 11.36675 0.00000
## 614763 11.36675 0.00000
## 614787 12.05990 0.00000
## 614878 11.36675 0.00000
## 614919 11.36675 0.00000
## ComplaintID CDisputed.fctr.Trn.Interact.High.cor.Y...glmnet .pos
## 618693 2052112 Y 618693
## 620320 2086972 Y 620320
## 621513 2049791 Y 621513
## 622998 2097011 Y 622998
## 623704 2081676 Y 623704
## 641223 2130564 Y 641223
## ComplaintID.1 Datereceived.last16.log1p Datereceived.last32.log1p
## 618693 2052112 11.36675 12.05990
## 620320 2086972 11.36675 12.05990
## 621513 2049791 11.36675 11.36675
## 622998 2097011 11.36675 11.36675
## 623704 2081676 11.36675 11.36675
## 641223 2130564 11.36675 12.05990
## Datereceived.last8.log1p Datesenttocompany.last16.log1p
## 618693 0.00000 11.36675
## 620320 0.00000 11.36675
## 621513 11.36675 11.36675
## 622998 11.36675 11.36675
## 623704 0.00000 11.36675
## 641223 11.36675 11.36675
## Datesenttocompany.last32.log1p Datesenttocompany.last8.log1p
## 618693 12.05990 0.00000
## 620320 12.05990 0.00000
## 621513 11.36675 11.36675
## 622998 11.36675 11.36675
## 623704 11.36675 0.00000
## 641223 12.05990 11.36675
## ComplaintID CDisputed.fctr.Trn.Interact.High.cor.Y...glmnet .pos
## 644622 2153079 Y 644622
## 645003 2153094 Y 645003
## 645990 2149756 Y 645990
## 646318 2155330 Y 646318
## 646475 2152883 Y 646475
## 646615 2155390 Y 646615
## ComplaintID.1 Datereceived.last16.log1p Datereceived.last32.log1p
## 644622 2153079 12.75304 12.97618
## 645003 2153094 12.75304 13.15850
## 645990 2149756 12.05990 12.46536
## 646318 2155330 12.97618 13.31265
## 646475 2152883 12.75304 12.97618
## 646615 2155390 12.97618 13.31265
## Datereceived.last8.log1p Datesenttocompany.last16.log1p
## 644622 12.46536 12.75304
## 645003 12.46536 12.75304
## 645990 11.36675 12.05990
## 646318 12.75304 12.97618
## 646475 12.05990 12.75304
## 646615 12.75304 12.97618
## Datesenttocompany.last32.log1p Datesenttocompany.last8.log1p
## 644622 12.97618 12.46536
## 645003 13.15850 12.46536
## 645990 12.46536 11.36675
## 646318 13.31265 12.75304
## 646475 12.97618 12.05990
## 646615 13.31265 12.75304
## id cor.y
## .pos .pos 0.01952133
## ComplaintID ComplaintID 0.04156423
## Datereceived.last16.log1p Datereceived.last16.log1p 0.07886947
## Datereceived.last32.log1p Datereceived.last32.log1p 0.05466152
## Datereceived.last8.log1p Datereceived.last8.log1p 0.07852619
## Datesenttocompany.last16.log1p Datesenttocompany.last16.log1p 0.07886947
## Datesenttocompany.last32.log1p Datesenttocompany.last32.log1p 0.05466152
## Datesenttocompany.last8.log1p Datesenttocompany.last8.log1p 0.07852619
## exclude.as.feat cor.y.abs
## .pos FALSE 0.01952133
## ComplaintID FALSE 0.04156423
## Datereceived.last16.log1p FALSE 0.07886947
## Datereceived.last32.log1p FALSE 0.05466152
## Datereceived.last8.log1p FALSE 0.07852619
## Datesenttocompany.last16.log1p FALSE 0.07886947
## Datesenttocompany.last32.log1p FALSE 0.05466152
## Datesenttocompany.last8.log1p FALSE 0.07852619
## cor.high.X freqRatio
## .pos <NA> 1.000000
## ComplaintID <NA> 1.000000
## Datereceived.last16.log1p <NA> 1.018160
## Datereceived.last32.log1p <NA> 2.963513
## Datereceived.last8.log1p <NA> 2.536909
## Datesenttocompany.last16.log1p Datereceived.last16.log1p 1.018160
## Datesenttocompany.last32.log1p Datereceived.last32.log1p 2.963513
## Datesenttocompany.last8.log1p Datereceived.last8.log1p 2.536909
## percentUnique zeroVar nzv
## .pos 100.00000000 FALSE FALSE
## ComplaintID 100.00000000 FALSE FALSE
## Datereceived.last16.log1p 0.01403837 FALSE FALSE
## Datereceived.last32.log1p 0.01637810 FALSE FALSE
## Datereceived.last8.log1p 0.01169864 FALSE FALSE
## Datesenttocompany.last16.log1p 0.01403837 FALSE FALSE
## Datesenttocompany.last32.log1p 0.01637810 FALSE FALSE
## Datesenttocompany.last8.log1p 0.01169864 FALSE FALSE
## is.cor.y.abs.low interaction.feat
## .pos FALSE NA
## ComplaintID FALSE NA
## Datereceived.last16.log1p FALSE NA
## Datereceived.last32.log1p FALSE NA
## Datereceived.last8.log1p FALSE NA
## Datesenttocompany.last16.log1p FALSE NA
## Datesenttocompany.last32.log1p FALSE NA
## Datesenttocompany.last8.log1p FALSE NA
## shapiro.test.p.value rsp_var_raw id_var
## .pos 2.994698e-39 FALSE NA
## ComplaintID 4.528050e-40 FALSE TRUE
## Datereceived.last16.log1p 2.288977e-72 FALSE NA
## Datereceived.last32.log1p 1.910489e-78 FALSE NA
## Datereceived.last8.log1p 3.713216e-76 FALSE NA
## Datesenttocompany.last16.log1p 2.391280e-72 FALSE NA
## Datesenttocompany.last32.log1p 1.377742e-78 FALSE NA
## Datesenttocompany.last8.log1p 2.876566e-76 FALSE NA
## rsp_var max min
## .pos NA 6.469090e+05 1
## ComplaintID NA 2.155390e+06 82
## Datereceived.last16.log1p NA 1.297618e+01 0
## Datereceived.last32.log1p NA 1.331265e+01 0
## Datereceived.last8.log1p NA 1.275304e+01 0
## Datesenttocompany.last16.log1p NA 1.297618e+01 0
## Datesenttocompany.last32.log1p NA 1.331265e+01 0
## Datesenttocompany.last8.log1p NA 1.275304e+01 0
## max.CDisputed.fctr.N max.CDisputed.fctr.Y
## .pos 6.071970e+05 6.073940e+05
## ComplaintID 2.053920e+06 2.146384e+06
## Datereceived.last16.log1p 1.297618e+01 1.275304e+01
## Datereceived.last32.log1p 1.315850e+01 1.315850e+01
## Datereceived.last8.log1p 1.275304e+01 1.246536e+01
## Datesenttocompany.last16.log1p 1.297618e+01 1.275304e+01
## Datesenttocompany.last32.log1p 1.315850e+01 1.315850e+01
## Datesenttocompany.last8.log1p 1.275304e+01 1.246536e+01
## min.CDisputed.fctr.N min.CDisputed.fctr.Y
## .pos 1 34
## ComplaintID 82 115
## Datereceived.last16.log1p 0 0
## Datereceived.last32.log1p 0 0
## Datereceived.last8.log1p 0 0
## Datesenttocompany.last16.log1p 0 0
## Datesenttocompany.last32.log1p 0 0
## Datesenttocompany.last8.log1p 0 0
## max.CDisputed.fctr.Interact.High.cor.Y..rcv.glmnet.N
## .pos 6.071470e+05
## ComplaintID 2.094668e+06
## Datereceived.last16.log1p 1.275304e+01
## Datereceived.last32.log1p 1.315850e+01
## Datereceived.last8.log1p 1.275304e+01
## Datesenttocompany.last16.log1p 1.275304e+01
## Datesenttocompany.last32.log1p 1.315850e+01
## Datesenttocompany.last8.log1p 1.275304e+01
## max.CDisputed.fctr.Interact.High.cor.Y..rcv.glmnet.Y
## .pos 6.073920e+05
## ComplaintID 2.143124e+06
## Datereceived.last16.log1p 1.246536e+01
## Datereceived.last32.log1p 1.246536e+01
## Datereceived.last8.log1p 1.205990e+01
## Datesenttocompany.last16.log1p 1.246536e+01
## Datesenttocompany.last32.log1p 1.246536e+01
## Datesenttocompany.last8.log1p 1.205990e+01
## min.CDisputed.fctr.Interact.High.cor.Y..rcv.glmnet.N
## .pos 29
## ComplaintID 359
## Datereceived.last16.log1p 0
## Datereceived.last32.log1p 0
## Datereceived.last8.log1p 0
## Datesenttocompany.last16.log1p 0
## Datesenttocompany.last32.log1p 0
## Datesenttocompany.last8.log1p 0
## min.CDisputed.fctr.Interact.High.cor.Y..rcv.glmnet.Y
## .pos 1.019460e+05
## ComplaintID 1.852271e+06
## Datereceived.last16.log1p 1.136675e+01
## Datereceived.last32.log1p 1.136675e+01
## Datereceived.last8.log1p 0.000000e+00
## Datesenttocompany.last16.log1p 1.136675e+01
## Datesenttocompany.last32.log1p 1.136675e+01
## Datesenttocompany.last8.log1p 0.000000e+00
## max.CDisputed.fctr.Trn.Interact.High.cor.Y...glmnet.N
## .pos 6.469090e+05
## ComplaintID 2.152348e+06
## Datereceived.last16.log1p 1.205990e+01
## Datereceived.last32.log1p 1.246536e+01
## Datereceived.last8.log1p 1.136675e+01
## Datesenttocompany.last16.log1p 1.205990e+01
## Datesenttocompany.last32.log1p 1.246536e+01
## Datesenttocompany.last8.log1p 1.136675e+01
## max.CDisputed.fctr.Trn.Interact.High.cor.Y...glmnet.Y
## .pos 6.466150e+05
## ComplaintID 2.155390e+06
## Datereceived.last16.log1p 1.297618e+01
## Datereceived.last32.log1p 1.331265e+01
## Datereceived.last8.log1p 1.275304e+01
## Datesenttocompany.last16.log1p 1.297618e+01
## Datesenttocompany.last32.log1p 1.331265e+01
## Datesenttocompany.last8.log1p 1.275304e+01
## min.CDisputed.fctr.Trn.Interact.High.cor.Y...glmnet.N
## .pos 607463
## ComplaintID 29578
## Datereceived.last16.log1p 0
## Datereceived.last32.log1p 0
## Datereceived.last8.log1p 0
## Datesenttocompany.last16.log1p 0
## Datesenttocompany.last32.log1p 0
## Datesenttocompany.last8.log1p 0
## min.CDisputed.fctr.Trn.Interact.High.cor.Y...glmnet.Y
## .pos 6.146880e+05
## ComplaintID 2.046184e+06
## Datereceived.last16.log1p 1.136675e+01
## Datereceived.last32.log1p 1.136675e+01
## Datereceived.last8.log1p 0.000000e+00
## Datesenttocompany.last16.log1p 1.136675e+01
## Datesenttocompany.last32.log1p 1.136675e+01
## Datesenttocompany.last8.log1p 0.000000e+00
## [1] "newobs total range outliers: 3229"
## [1] 0.5
## [1] "glbMdlSltId: Interact.High.cor.Y##rcv#glmnet"
## [1] "glbMdlFnlNslId: Trn.Interact.High.cor.Y###glmnet"
## [1] "Cross Validation issues:"
## Warning in glbgetDisplayModelsDf(): Cross Validation issues:
## MFO###myMFO_classfr Random###myrandom_classfr
## 0 0
## Max.cor.Y.rcv.1X1###glmnet Max.cor.Y##rcv#rpart
## 0 1
## Trn.Interact.High.cor.Y###glmnet
## 0
## max.Accuracy.OOB max.AUCROCR.OOB
## Interact.High.cor.Y##rcv#glmnet 0.7927000 0.5771830
## All.X##rcv#glm 0.7886055 0.6057013
## Low.cor.X##rcv#glmnet 0.7884885 0.6064605
## All.X##rcv#glmnet 0.7881376 0.6057797
## Max.cor.Y.Time.Lag##rcv#glmnet 0.7743332 0.5583842
## Max.cor.Y.rcv.1X1###glmnet 0.7743332 0.5520402
## Random###myrandom_classfr 0.7743332 0.5080821
## MFO###myMFO_classfr 0.7743332 0.5000000
## Max.cor.Y##rcv#rpart 0.7743332 0.5000000
## Trn.Interact.High.cor.Y###glmnet NA NA
## Trn.Interact.High.cor.Y##rcv#glmnet NA NA
## max.AUCpROC.OOB
## Interact.High.cor.Y##rcv#glmnet 0.5447352
## All.X##rcv#glm 0.5301651
## Low.cor.X##rcv#glmnet 0.5289447
## All.X##rcv#glmnet 0.5272813
## Max.cor.Y.Time.Lag##rcv#glmnet 0.5000000
## Max.cor.Y.rcv.1X1###glmnet 0.5000000
## Random###myrandom_classfr 0.4927907
## MFO###myMFO_classfr 0.5000000
## Max.cor.Y##rcv#rpart 0.5000000
## Trn.Interact.High.cor.Y###glmnet NA
## Trn.Interact.High.cor.Y##rcv#glmnet NA
## min.elapsedtime.everything
## Interact.High.cor.Y##rcv#glmnet 61.686
## All.X##rcv#glm 43.953
## Low.cor.X##rcv#glmnet 81.479
## All.X##rcv#glmnet 103.080
## Max.cor.Y.Time.Lag##rcv#glmnet 9.539
## Max.cor.Y.rcv.1X1###glmnet 0.752
## Random###myrandom_classfr 0.301
## MFO###myMFO_classfr 0.296
## Max.cor.Y##rcv#rpart 2.376
## Trn.Interact.High.cor.Y###glmnet 2.846
## Trn.Interact.High.cor.Y##rcv#glmnet 79.604
## max.Accuracy.fit
## Interact.High.cor.Y##rcv#glmnet 0.7919981
## All.X##rcv#glm 0.7868897
## Low.cor.X##rcv#glmnet 0.7864315
## All.X##rcv#glmnet 0.7863730
## Max.cor.Y.Time.Lag##rcv#glmnet 0.7743332
## Max.cor.Y.rcv.1X1###glmnet 0.7743332
## Random###myrandom_classfr 0.7743332
## MFO###myMFO_classfr 0.7743332
## Max.cor.Y##rcv#rpart 0.7743332
## Trn.Interact.High.cor.Y###glmnet 0.7926299
## Trn.Interact.High.cor.Y##rcv#glmnet 0.7920995
## opt.prob.threshold.fit
## Interact.High.cor.Y##rcv#glmnet 0.50
## All.X##rcv#glm 0.45
## Low.cor.X##rcv#glmnet 0.45
## All.X##rcv#glmnet 0.45
## Max.cor.Y.Time.Lag##rcv#glmnet 0.50
## Max.cor.Y.rcv.1X1###glmnet 0.50
## Random###myrandom_classfr 0.80
## MFO###myMFO_classfr 0.50
## Max.cor.Y##rcv#rpart 0.50
## Trn.Interact.High.cor.Y###glmnet 0.50
## Trn.Interact.High.cor.Y##rcv#glmnet 0.50
## opt.prob.threshold.OOB
## Interact.High.cor.Y##rcv#glmnet 0.50
## All.X##rcv#glm 0.45
## Low.cor.X##rcv#glmnet 0.45
## All.X##rcv#glmnet 0.45
## Max.cor.Y.Time.Lag##rcv#glmnet 0.50
## Max.cor.Y.rcv.1X1###glmnet 0.50
## Random###myrandom_classfr 0.80
## MFO###myMFO_classfr 0.50
## Max.cor.Y##rcv#rpart 0.50
## Trn.Interact.High.cor.Y###glmnet NA
## Trn.Interact.High.cor.Y##rcv#glmnet NA
## [1] "Interact.High.cor.Y##rcv#glmnet OOB confusion matrix & accuracy: "
## Prediction
## Reference N Y
## N 6597 22
## Y 1750 179
## err.abs.fit.sum err.abs.OOB.sum err.abs.trn.sum err.abs.new.sum
## .dummy 11473.35 2868.779 14341.63 NA
## .freqRatio.Fit .freqRatio.OOB .freqRatio.Tst .n.Fit .n.New.N
## .dummy 1 1 1 34192 2855
## .n.New.Y .n.OOB .n.Trn.N .n.Trn.Y .n.Tst .n.fit .n.new .n.trn
## .dummy 374 8548 33095 9645 3229 34192 3229 42740
## err.abs.OOB.mean err.abs.fit.mean err.abs.new.mean err.abs.trn.mean
## .dummy 0.3356082 0.3355567 NA 0.3355552
## err.abs.fit.sum err.abs.OOB.sum err.abs.trn.sum err.abs.new.sum
## 1.147335e+04 2.868779e+03 1.434163e+04 NA
## .freqRatio.Fit .freqRatio.OOB .freqRatio.Tst .n.Fit
## 1.000000e+00 1.000000e+00 1.000000e+00 3.419200e+04
## .n.New.N .n.New.Y .n.OOB .n.Trn.N
## 2.855000e+03 3.740000e+02 8.548000e+03 3.309500e+04
## .n.Trn.Y .n.Tst .n.fit .n.new
## 9.645000e+03 3.229000e+03 3.419200e+04 3.229000e+03
## .n.trn err.abs.OOB.mean err.abs.fit.mean err.abs.new.mean
## 4.274000e+04 3.356082e-01 3.355567e-01 NA
## err.abs.trn.mean
## 3.355552e-01
## [1] "Features Importance for selected models:"
## Interact.High.cor.Y..rcv.glmnet.imp
## Datereceived.last16.log1p:Datereceived.year.fctr2016 100.000000
## Datereceived.last16.log1p:Datereceived.month.fctr08 70.695220
## Datereceived.last16.log1p:Datereceived.month.fctr09 54.163816
## Datereceived.last16.log1p:Datereceived.month.fctr07 46.448422
## Datereceived.last16.log1p:Datereceived.month.fctr06 33.832242
## Datereceived.last16.log1p:Datereceived.year.fctr2015 31.296530
## Datereceived.last8.log1p 29.181887
## Datereceived.last16.log1p:Datereceived.year.fctr2014 28.855880
## Datereceived.last16.log1p:Datereceived.month.fctr05 27.952870
## Datereceived.last16.log1p:Datereceived.month.fctr12 27.008194
## Datereceived.last16.log1p:Datereceived.wkend 22.428585
## Datereceived.last16.log1p:Datereceived.wkday.fctr2 21.330782
## Datereceived.last16.log1p:Datereceived.month.fctr02 17.207149
## Datereceived.last16.log1p:Datereceived.date.fctr(25,31] 16.291716
## Datereceived.last16.log1p:Datereceived.date.fctr(19,25] 10.066838
## Datereceived.last16.log1p:Datereceived.month.fctr11 4.863029
## Trn.Interact.High.cor.Y...glmnet.imp
## Datereceived.last16.log1p:Datereceived.year.fctr2016 100.000000
## Datereceived.last16.log1p:Datereceived.month.fctr08 64.339878
## Datereceived.last16.log1p:Datereceived.month.fctr09 50.657108
## Datereceived.last16.log1p:Datereceived.month.fctr07 40.880867
## Datereceived.last16.log1p:Datereceived.month.fctr06 31.086060
## Datereceived.last16.log1p:Datereceived.year.fctr2015 22.069711
## Datereceived.last8.log1p 28.122200
## Datereceived.last16.log1p:Datereceived.year.fctr2014 22.724945
## Datereceived.last16.log1p:Datereceived.month.fctr05 24.401346
## Datereceived.last16.log1p:Datereceived.month.fctr12 28.879889
## Datereceived.last16.log1p:Datereceived.wkend 22.675852
## Datereceived.last16.log1p:Datereceived.wkday.fctr2 18.433962
## Datereceived.last16.log1p:Datereceived.month.fctr02 11.028495
## Datereceived.last16.log1p:Datereceived.date.fctr(25,31] 9.383011
## Datereceived.last16.log1p:Datereceived.date.fctr(19,25] 5.605530
## Datereceived.last16.log1p:Datereceived.month.fctr11 10.240113
## [1] "glbObsNew prediction stats:"
##
## N Y
## 2855 374
## label step_major step_minor label_minor bgn end
## 8 predict.data.new 4 0 0 2753.944 2781.822
## 9 display.session.info 5 0 0 2781.822 NA
## elapsed
## 8 27.878
## 9 NA
Null Hypothesis (\(\sf{H_{0}}\)): mpg is not impacted by am_fctr.
The variance by am_fctr appears to be independent. #{r q1, cache=FALSE} # print(t.test(subset(cars_df, am_fctr == "automatic")$mpg, # subset(cars_df, am_fctr == "manual")$mpg, # var.equal=FALSE)$conf) # We reject the null hypothesis i.e. we have evidence to conclude that am_fctr impacts mpg (95% confidence). Manual transmission is better for miles per gallon versus automatic transmission.
## label step_major step_minor label_minor bgn end
## 6 fit.data.training 3 0 0 640.973 2725.819
## 2 fit.models 2 0 0 22.460 374.899
## 3 fit.models 2 1 1 374.899 619.226
## 7 fit.data.training 3 1 1 2725.819 2753.943
## 8 predict.data.new 4 0 0 2753.944 2781.822
## 1 select.features 1 0 0 6.502 22.459
## 4 fit.models 2 2 2 619.227 635.005
## 5 fit.models 2 3 3 635.005 640.973
## elapsed duration
## 6 2084.846 2084.846
## 2 352.439 352.439
## 3 244.328 244.327
## 7 28.124 28.124
## 8 27.878 27.878
## 1 15.957 15.957
## 4 15.778 15.778
## 5 5.968 5.968
## [1] "Total Elapsed Time: 2,781.822 secs"